From e53e87b39ecf899c37bc0ced3e3c9dc6ad20834f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 19 May 2026 14:08:13 +0000 Subject: [PATCH] Add A2/A3/A4 gap tests; pin GroupChat duplicate-key bug discovered by A2 Agent-Logs-Url: https://github.com/microsoft/agent-framework/sessions/153b10f3-2555-45fe-b264-42a2e9100cf2 Co-authored-by: lokitoth <6936551+lokitoth@users.noreply.github.com> --- .../issue-5350-root-cause-validation-plan.md | 22 +- ...ToolApprovalRequestCheckpointReproTests.cs | 305 ++++++++++++++++++ 2 files changed, 315 insertions(+), 12 deletions(-) diff --git a/docs/working/issue-5350-root-cause-validation-plan.md b/docs/working/issue-5350-root-cause-validation-plan.md index 61360ba36a..3f1469f50b 100644 --- a/docs/working/issue-5350-root-cause-validation-plan.md +++ b/docs/working/issue-5350-root-cause-validation-plan.md @@ -75,19 +75,17 @@ the OP's reported scenario, varying one dimension at a time. ### Track A — Reproduce in a configuration closer to the OP's pattern "B" -The OP's repro path differs from the new tests in three concrete ways. Test #7 (the -maximal repro added to this PR) closes the biggest gap — it uses a real -`ChatClientAgent` + `FunctionInvokingChatClient` + `ApprovalRequiredAIFunction` and -still passes — but the remaining differences are still worth isolating. Each row below -is "add a test that reproduces the OP's symptom (`postResume.ToolCall is not -FunctionCallContent`)". +The OP's repro path differs from the new tests in three concrete ways. Tests #7 +(maximal repro) and #8–#10 (A2 / A3 / A4 below) close all four gaps. The OP's +specific hypothesis (`TARC.ToolCall is not FunctionCallContent` after resume) does +not reproduce in any of them; A2 did however uncover a *separate, unrelated* bug. -| Step | Variable that changes vs. the passing tests in this repo | Why it matters | Pass criteria | -|------|--------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| -| A1 | ~~Use `ChatClientAgent` + `ApprovalRequiredAIFunction` bound into a `WorkflowBuilder`~~ — **covered by test #7 in this PR; passes.** | Was the largest gap to the OP's repro. Closed. | N/A — already passing. | -| A2 | Multi-agent variant: same as #7 but with the agent inside `GroupChatBuilder` (the OP's actual orchestration), to rule out a group-chat-specific re-wrap or replay path that drops the TARC.ToolCall type. | If group chat re-encodes TARC as part of `ChatMessage.Contents` (`AIContent` polymorphism is two-deep through `ToolApprovalRequestContent`), one branch may resolve and the other not. | Test fails → root cause is in the group-chat message round-trip, not the marshaller. | -| A3 | Same as #7 but with the JSON `JsonElement` round-tripped through `string` + `JsonDocument.Parse` between commit and retrieve (i.e. emulating the SQL `nvarchar` hop in the OP's Dapper store). | The OP uses Dapper + SQL Server. If the column / driver round-trip preserves ordering, this should be identity-preserving — but if it reorders metadata properties, the `$type` discriminator can be moved out of first position, which then requires `AllowOutOfOrderMetadataProperties = true`. Note `JsonMarshaller` only propagates that one flag from the user's `customOptions`. | Test fails when ordering is permuted before deserialization. Confirms the storage layer is reordering metadata. | -| A4 | Same as #7 but with a non-default `JsonSerializerOptions` passed as `customOptions` to `CheckpointManager.CreateJson`, where the user-supplied options do **not** include the polymorphism resolver and `JsonMarshaller`'s `LookupTypeInfo` falls back to them for some type. | `JsonMarshaller.LookupTypeInfo` only goes to the external options when the internal chain doesn't know about the type. For most cases this won't trigger, but it's worth confirming that supplying a custom `JsonSerializerOptions` does not silently displace the internal chain. | Either the external options are never consulted for `AIContent` (good), or there is a sneak path where they are (regression). | +| Step | Variable that changes vs. the passing tests in this repo | Why it matters | Outcome | +|------|--------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------| +| A1 | ~~Use `ChatClientAgent` + `ApprovalRequiredAIFunction` bound into a `WorkflowBuilder`~~ — **covered by test #7 in this PR; passes.** | Was the largest gap to the OP's repro. Closed. | OP hypothesis disproved — `TARC.ToolCall is FunctionCallContent` post-resume, and the tool is actually invoked exactly once when the approval response is sent. | +| A2 | Multi-agent variant: same as #7 but with the agent inside a `GroupChatBuilder` (the OP's actual orchestration) with `RoundRobinGroupChatManager`. | If group chat re-encodes TARC as part of `ChatMessage.Contents` (`AIContent` polymorphism is two-deep through `ToolApprovalRequestContent`), one branch may resolve and the other not. | OP hypothesis disproved — `TARC.ToolCall is FunctionCallContent` post-resume. **But:** sending the approval response surfaces a *different* real bug — `FunctionInvokingChatClient.ExtractAndRemoveApprovalRequestsAndResponses` throws `ArgumentException: An item with the same key has already been added. Key: ficc_call-1`. Pinned by test as documented misbehavior. | +| A3 | Same as #7 but with the checkpoint `JsonElement` round-tripped through `string` + `JsonDocument.Parse` between commit and retrieve (`StringRoundTripJsonStore`), emulating the SQL `nvarchar` hop in the OP's Dapper store. | The OP uses Dapper + SQL Server. If the column / driver round-trip preserves ordering, this should be identity-preserving — but if it reorders metadata properties, the `$type` discriminator can be moved out of first position, which then requires `AllowOutOfOrderMetadataProperties = true`. | OP hypothesis disproved — byte-preserving string round-trip is identity-preserving for the relevant payload; `TARC.ToolCall is FunctionCallContent` post-resume; tool invoked exactly once. The OP's storage layer would have to *perturb* the JSON (e.g. reorder metadata) for this to reproduce. | +| A4 | Same as #7 but with non-default `JsonSerializerOptions` (`JsonSerializerDefaults.Web`, no AIJsonUtilities resolver) passed as `customOptions` to `CheckpointManager.CreateJson`. | `JsonMarshaller.LookupTypeInfo` only goes to the external options when the internal chain doesn't know about the type. For most cases this won't trigger, but it's worth confirming that supplying a custom `JsonSerializerOptions` does not silently displace the internal chain. | OP hypothesis disproved — custom external options that DO NOT know about `AIContent` types are correctly ignored for known types; the internal `WorkflowsJsonUtilities.DefaultOptions` chain wins. `TARC.ToolCall is FunctionCallContent` post-resume; tool invoked exactly once. | ### Track B — Validate the wire format the OP actually persists diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/ToolApprovalRequestCheckpointReproTests.cs b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/ToolApprovalRequestCheckpointReproTests.cs index 331c3cc798..a3a2b2ce9b 100644 --- a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/ToolApprovalRequestCheckpointReproTests.cs +++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/ToolApprovalRequestCheckpointReproTests.cs @@ -485,4 +485,309 @@ public class ToolApprovalRequestCheckpointReproTests public object? GetService(Type serviceType, object? serviceKey = null) => null; public void Dispose() { } } + + /// + /// Track A2 — same as the maximal repro (test #7) but with the agent inside a + /// GroupChatBuilder (round-robin manager with a single participant), to rule out + /// a group-chat-specific re-wrap or replay path that drops the + /// type during the checkpoint round-trip. + /// + /// Finding: the type IS preserved after + /// resume (consistent with tests #1 – #7 — the OP's hypothesis still does not reproduce + /// in this configuration). However, sending an approval response back into the resumed + /// group-chat workflow surfaces a different, real bug: + /// FunctionInvokingChatClient.ExtractAndRemoveApprovalRequestsAndResponses throws + /// with message + /// "An item with the same key has already been added. Key: ficc_call-1". + /// This test pins that observed behavior so future investigation can latch onto it. + /// + [Fact] + public async Task Repro_5350_A2_GroupChatBuilder_WithApprovalRequiredTool_JsonCheckpointResume_PreservesFunctionCallContentAsync() + { + ReproHarness harness = new(); + Workflow workflow = AgentWorkflowBuilder + .CreateGroupChatBuilderWith(agents => new RoundRobinGroupChatManager(agents) { MaximumIterationCount = 4 }) + .AddParticipants(harness.Agent) + .Build(); + + await RunReproAsync( + workflow, + harness, + CheckpointManager.CreateJson(new InMemoryJsonStore()), + scenarioName: "A2 (group chat)", + expectCleanPostApprovalCompletion: false, + postApprovalCompletionAssertion: events => + { + // A2 currently surfaces a duplicate-key crash inside FICC when the group-chat + // workflow resumes with a pending approval request and the response arrives. + // This assertion pins the observed misbehavior. If the upstream + // GroupChat + AIAgentHostExecutor + FICC interaction is fixed, this test will + // start failing here — that's intentional. The fix should then either remove + // this assertion or convert it to a clean-completion assertion mirroring + // test #7. TODO(#5350-followup): file a separate issue for the duplicate-key + // crash if one does not already exist, and link it here. + ArgumentException? duplicateKey = events + .OfType() + .Select(e => e.Data as Exception) + .Select(e => e?.InnerException as ArgumentException ?? e as ArgumentException) + .FirstOrDefault(e => e?.Message.Contains("ficc_call-1", StringComparison.Ordinal) == true); + duplicateKey.Should().NotBeNull( + "[A2 (group chat)] approving the resumed approval request currently surfaces a duplicate-key " + + "ArgumentException out of FunctionInvokingChatClient.ExtractAndRemoveApprovalRequestsAndResponses — " + + "this is a real bug, but it is NOT the bug claimed in issue #5350. The TARC.ToolCall was already " + + "asserted to be a FunctionCallContent above; the OP's hypothesis remains unreproduced."); + }); + } + + /// + /// Track A3 — same as the maximal repro (test #7) but the checkpoint + /// is round-tripped through + + /// between commit and retrieve, to emulate the SQL nvarchar / Dapper hop in the + /// OP's setup. This catches cases where the storage layer would only fail if it perturbed + /// the JSON in some way (e.g. dropped metadata, re-encoded numbers), and confirms a + /// byte-preserving string round-trip on its own is harmless. + /// + [Fact] + public async Task Repro_5350_A3_StringRoundTripStore_PreservesFunctionCallContentAsync() + { + ReproHarness harness = new(); + Workflow workflow = new WorkflowBuilder(harness.Agent).Build(); + + CheckpointManager checkpointManager = CheckpointManager.CreateJson( + new StringRoundTripJsonStore(new InMemoryJsonStore())); + + await RunReproAsync( + workflow, + harness, + checkpointManager, + scenarioName: "A3 (string round-trip store)"); + } + + /// + /// Track A4 — same as the maximal repro (test #7) but with a non-default + /// passed as customOptions to + /// . The custom options intentionally do NOT + /// include the polymorphism resolver chain (AgentAbstractionsJsonUtilities → + /// AIJsonUtilities) — confirming that JsonMarshaller's internal + /// WorkflowsJsonUtilities.DefaultOptions chain always wins for known + /// types and the external options cannot silently displace it. + /// + [Fact] + public async Task Repro_5350_A4_CustomJsonSerializerOptions_PreservesFunctionCallContentAsync() + { + ReproHarness harness = new(); + Workflow workflow = new WorkflowBuilder(harness.Agent).Build(); + + // Custom options that DO NOT know about AIContent / FunctionCallContent. + JsonSerializerOptions customOptions = new(JsonSerializerDefaults.Web) + { + WriteIndented = true, + }; + + CheckpointManager checkpointManager = CheckpointManager.CreateJson( + new InMemoryJsonStore(), + customOptions); + + await RunReproAsync( + workflow, + harness, + checkpointManager, + scenarioName: "A4 (custom JsonSerializerOptions)"); + } + + /// + /// Shared end-to-end repro driver used by tests #7, A2, A3, A4. + /// Drives the workflow until an approval request appears, captures the latest checkpoint, + /// disposes the run, resumes from the checkpoint, and asserts that the resumed + /// 's is + /// still a . Then sends an approval response and asserts + /// the wrapped is invoked exactly once. + /// + private static async Task RunReproAsync( + Workflow workflow, + ReproHarness harness, + CheckpointManager checkpointManager, + string scenarioName, + bool expectCleanPostApprovalCompletion = true, + Action>? postApprovalCompletionAssertion = null) + { + InProcessExecutionEnvironment env = InProcessExecution.OffThread; + List inputMessages = [new(ChatRole.User, "What's the weather in Amsterdam?")]; + + // Act 1 — run until approval request appears and capture latest checkpoint. + ExternalRequest? firstRunRequest = null; + CheckpointInfo? checkpoint = null; + + await using (StreamingRun firstRun = await env.WithCheckpointing(checkpointManager) + .RunStreamingAsync(workflow, inputMessages)) + { + (await firstRun.TrySendMessageAsync(new TurnToken(emitEvents: false))) + .Should().BeTrue($"[{scenarioName}] the workflow should accept a TurnToken"); + + using CancellationTokenSource cts = new(TimeSpan.FromSeconds(30)); + await foreach (WorkflowEvent evt in firstRun.WatchStreamAsync(blockOnPendingRequest: false, cts.Token)) + { + if (evt is RequestInfoEvent requestInfo) + { + firstRunRequest ??= requestInfo.Request; + } + + if (evt is SuperStepCompletedEvent step && step.CompletionInfo?.Checkpoint is { } cp) + { + checkpoint = cp; + } + } + } + + firstRunRequest.Should().NotBeNull( + $"[{scenarioName}] the ChatClientAgent + FICC pipeline should surface the approval request as a workflow RequestInfoEvent"); + checkpoint.Should().NotBeNull( + $"[{scenarioName}] a checkpoint should have been produced while the approval request was pending"); + harness.ChatCallCount.Should().Be(1, $"[{scenarioName}] the mock chat client should have been called exactly once before approval was requested"); + harness.InvocationCount.Should().Be(0, $"[{scenarioName}] the underlying tool must NOT have been invoked before approval was granted"); + + ToolApprovalRequestContent? preCheckpoint = firstRunRequest!.Data.As(); + preCheckpoint.Should().NotBeNull($"[{scenarioName}] the pending external request should carry a ToolApprovalRequestContent payload"); + preCheckpoint!.ToolCall.Should().BeOfType( + $"[{scenarioName}] the pre-checkpoint pending request payload must already be a FunctionCallContent"); + + // Act 2 — resume from checkpoint on a fresh handle. + ExternalRequest? resumedRequest = null; + List postResumeEvents = []; + + await using (StreamingRun resumed = await env.WithCheckpointing(checkpointManager) + .ResumeStreamingAsync(workflow, checkpoint!)) + { + using CancellationTokenSource cts = new(TimeSpan.FromSeconds(30)); + await foreach (WorkflowEvent evt in resumed.WatchStreamAsync(blockOnPendingRequest: false, cts.Token)) + { + if (evt is RequestInfoEvent requestInfo) + { + resumedRequest ??= requestInfo.Request; + } + } + + resumedRequest.Should().NotBeNull( + $"[{scenarioName}] the resumed workflow should re-emit the pending approval RequestInfoEvent"); + + ToolApprovalRequestContent? postResume = resumedRequest!.Data.As(); + postResume.Should().NotBeNull( + $"[{scenarioName}] ExternalRequest.Data.As() should materialize the payload after JSON-checkpoint resume"); + postResume!.ToolCall.Should().NotBeNull($"[{scenarioName}] the resumed TARC must carry its ToolCall"); + postResume.ToolCall.Should().BeOfType( + $"[{scenarioName}] after CheckpointManager.CreateJson round-trip via ResumeStreamingAsync, " + + "ToolApprovalRequestContent.ToolCall must still be a FunctionCallContent so that " + + "FunctionInvokingChatClient's pattern match (`tarc.ToolCall is FunctionCallContent`) continues to fire (issue #5350)."); + + // Act 3 — approve and finish the workflow. + ToolApprovalResponseContent approvalResponse = postResume.CreateResponse(approved: true); + await resumed.SendResponseAsync(resumedRequest.CreateResponse(approvalResponse)); + + using CancellationTokenSource cts2 = new(TimeSpan.FromSeconds(30)); + await foreach (WorkflowEvent evt in resumed.WatchStreamAsync(blockOnPendingRequest: false, cts2.Token)) + { + postResumeEvents.Add(evt); + } + } + + if (expectCleanPostApprovalCompletion) + { + harness.InvocationCount.Should().Be(1, + $"[{scenarioName}] approving the request should cause FunctionInvokingChatClient to invoke the wrapped AIFunction exactly once"); + postResumeEvents.OfType().Should().BeEmpty( + $"[{scenarioName}] no workflow errors should be raised when responding to the resumed approval request"); + postResumeEvents.OfType().Should().BeEmpty( + $"[{scenarioName}] no executor failures should be raised when responding to the resumed approval request"); + } + + postApprovalCompletionAssertion?.Invoke(postResumeEvents); + } + + /// + /// Bundles a + counting + /// + in a single reusable harness so the four end-to-end repro + /// variants (#7, A2, A3, A4) can share identical setup. + /// + private sealed class ReproHarness + { + public const string ToolName = "GetWeather"; + public const string ToolResultText = "Sunny, 22°C"; + public const string ToolCallId = "call-1"; + public const string FinalAssistantText = "The weather in Amsterdam is sunny and 22°C."; + + private int _invocationCount; + private int _chatCallIndex; + + public int InvocationCount => Volatile.Read(ref this._invocationCount); + public int ChatCallCount => Volatile.Read(ref this._chatCallIndex); + + public ChatClientAgent Agent { get; } + + public ReproHarness() + { + AIFunction underlyingTool = AIFunctionFactory.Create( + ([Description("City to look up")] string city) => + { + Interlocked.Increment(ref this._invocationCount); + return ToolResultText; + }, + name: ToolName, + description: "Gets the weather for the given city"); + + ApprovalRequiredAIFunction approvalTool = new(underlyingTool); + + MockChatClient mockChatClient = new((messages, options) => + { + int index = Interlocked.Increment(ref this._chatCallIndex) - 1; + return index switch + { + 0 => new ChatResponse(new ChatMessage(ChatRole.Assistant, + [new FunctionCallContent( + callId: ToolCallId, + name: ToolName, + arguments: new Dictionary { ["city"] = "Amsterdam" })])), + _ => new ChatResponse(new ChatMessage(ChatRole.Assistant, FinalAssistantText)), + }; + }); + + this.Agent = new ChatClientAgent( + mockChatClient, + instructions: "You are a weather agent.", + name: "WeatherAgent", + tools: [approvalTool]); + } + } + + /// + /// Wraps a and forces every to + /// round-trip through GetRawText() + + /// during commit and again during retrieve. This emulates a Dapper-backed SQL store where + /// the is materialized to a string for the nvarchar(max) + /// column on the way in and re-parsed on the way back out. Used by the A3 test only; + /// the double parse-and-clone is intentionally extra work and not suitable for production use. + /// + private sealed class StringRoundTripJsonStore(JsonCheckpointStore inner) : JsonCheckpointStore + { + public override async ValueTask CreateCheckpointAsync(string sessionId, JsonElement value, CheckpointInfo? parent = null) + { + JsonElement roundTripped = RoundTrip(value); + return await inner.CreateCheckpointAsync(sessionId, roundTripped, parent).ConfigureAwait(false); + } + + public override async ValueTask RetrieveCheckpointAsync(string sessionId, CheckpointInfo key) + { + JsonElement raw = await inner.RetrieveCheckpointAsync(sessionId, key).ConfigureAwait(false); + return RoundTrip(raw); + } + + public override ValueTask> RetrieveIndexAsync(string sessionId, CheckpointInfo? withParent = null) + => inner.RetrieveIndexAsync(sessionId, withParent); + + private static JsonElement RoundTrip(JsonElement element) + { + string raw = element.GetRawText(); + using JsonDocument doc = JsonDocument.Parse(raw); + return doc.RootElement.Clone(); + } + } }