Merge branch 'main' into dev/dotnet_workflow/mark_magentic_experimental

2026-06-16 21:04:09 +08:00 · 2026-05-07 17:11:00 -04:00
parent 6a54af6ac9 d3518ad19d
commit 89f3288c04
11 changed files with 371 additions and 117 deletions
@@ -273,6 +273,8 @@ jobs:
            -c ${{ matrix.configuration }} `
            --no-build -v Normal `
            --report-xunit-trx `
+            --report-junit `
+            --results-directory ../IntegrationTestResults/ `
            --ignore-exit-code 8 `
            --filter-not-trait "Category=IntegrationDisabled" `
            --filter-not-trait "Category=FoundryHostedAgents" `
@@ -294,6 +296,10 @@ jobs:
          AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }}
          AZURE_AI_MODEL_DEPLOYMENT_NAME: ${{ vars.AZURE_AI_MODEL_DEPLOYMENT_NAME }}
          AZURE_AI_BING_CONNECTION_ID: ${{ vars.AZURE_AI_BING_CONNECTION_ID }}
+          # Anthropic Models
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          ANTHROPIC_CHAT_MODEL_NAME: ${{ vars.ANTHROPIC_CHAT_MODEL_NAME }}
+          ANTHROPIC_REASONING_MODEL_NAME: ${{ vars.ANTHROPIC_REASONING_MODEL_NAME }}

      # Generate test reports and check coverage
      - name: Generate test reports
@@ -316,6 +322,14 @@ jobs:
        shell: pwsh
        run: ./dotnet/eng/scripts/dotnet-check-coverage.ps1 -JsonReportPath "TestResults/Reports/Summary.json" -CoverageThreshold $env:COVERAGE_THRESHOLD

+      - name: Upload integration test results
+        if: always() && github.event_name != 'pull_request' && matrix.integration-tests
+        uses: actions/upload-artifact@v7
+        with:
+          name: dotnet-test-results-${{ matrix.targetFramework }}-${{ matrix.os }}
+          path: IntegrationTestResults/**/*.junit
+          if-no-files-found: ignore
+
  # The Foundry hosted-agent IT is costly (it builds a container, pushes to ACR, and provisions
  # live agents on a separate Foundry project). Running it in its own job keeps the overall
  # workflow time roughly flat: it executes in parallel to dotnet-build and dotnet-test and is
@@ -456,3 +470,64 @@ jobs:
        uses: actions/github-script@v8
        with:
          script: core.setFailed('Integration Tests Cancelled!')
+
+  # Integration test trend report (aggregates JUnit XML results from dotnet test jobs).
+  # Skipped on pull requests, where the dotnet-test job uploads no JUnit artifacts, and
+  # whenever dotnet-test reported neither 'success' nor 'failure'.
+  dotnet-integration-test-report:
+    name: Integration Test Report
+    if: >
+      always() &&
+      github.event_name != 'pull_request' &&
+      (contains(join(needs.*.result, ','), 'success') ||
+       contains(join(needs.*.result, ','), 'failure'))
+    needs: [dotnet-test]
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Applies to `run:` steps only; `uses:` steps below take workspace-relative paths,
+        # which is why they spell out the `python/` prefix explicitly.
+        working-directory: python
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+          # Only the Python tooling is needed to build the report.
+          sparse-checkout: |
+            .github/actions/python-setup
+            python
+      - name: Set up python and install the project
+        uses: ./.github/actions/python-setup
+        with:
+          python-version: "3.13"
+          os: ${{ runner.os }}
+      - name: Download all test results from current run
+        uses: actions/download-artifact@v4
+        with:
+          pattern: dotnet-test-results-*
+          path: dotnet-test-results/
+      # The history file is carried between runs through the cache. The key embeds the
+      # run id, so the exact key never hits and `restore-keys` selects a previous entry.
+      - name: Restore report history cache
+        uses: actions/cache/restore@v4
+        with:
+          path: python/dotnet-integration-report-history.json
+          key: dotnet-integration-report-history-${{ github.run_id }}
+          restore-keys: |
+            dotnet-integration-report-history-
+      # Arguments: <reports-dir> <history-file> <output-file>, relative to `python/`.
+      - name: Generate trend report
+        run: >
+          uv run python scripts/integration_test_report/aggregate.py
+          ../dotnet-test-results/
+          dotnet-integration-report-history.json
+          dotnet-integration-test-report.md
+      - name: Post to Job Summary
+        if: always()
+        # This step also runs after a failed report generation, in which case the
+        # markdown file does not exist; write a placeholder instead of failing on `cat`.
+        run: |
+          if [ -f dotnet-integration-test-report.md ]; then
+            cat dotnet-integration-test-report.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "Integration test report was not generated." >> "$GITHUB_STEP_SUMMARY"
+          fi
+      - name: Save report history cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: python/dotnet-integration-report-history.json
+          key: dotnet-integration-report-history-${{ github.run_id }}
+      - name: Upload trend report
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: dotnet-integration-test-report
+          path: |
+            python/dotnet-integration-test-report.md
+            python/dotnet-integration-report-history.json
@@ -33,3 +33,4 @@ Console.WriteLine(await agent.RunAsync("Write a haiku about Microsoft Agent Fram
 - [Design Documents](../docs/design)
 - [Architectural Decision Records](../docs/decisions)
 - [MSFT Learn Docs](https://learn.microsoft.com/agent-framework/overview/agent-framework-overview)
+
@@ -13,5 +13,5 @@ internal sealed class SequenceNumber
    /// Gets the next sequence number.
    /// </summary>
    /// <returns>The next sequence number.</returns>
-    public int Increment() => this._sequenceNumber++;
+    public int Increment() => System.Threading.Interlocked.Increment(ref this._sequenceNumber) - 1;
 }
@@ -17,9 +17,6 @@ namespace AnthropicChatCompletion.IntegrationTests;

 public class AnthropicChatCompletionFixture : IChatClientAgentFixture
 {
-    // All tests for Anthropic are intended to be ran locally as the CI pipeline for Anthropic is not setup.
-    internal const string SkipReason = "Integrations tests for local execution only";
-
    private readonly bool _useReasoningModel;
    private readonly bool _useBeta;

@@ -105,7 +102,22 @@ public class AnthropicChatCompletionFixture : IChatClientAgentFixture

    public async ValueTask InitializeAsync()
    {
-        Assert.SkipWhen(SkipReason is not null, SkipReason ?? string.Empty);
+        // Temporarily disabled: Anthropic SDK has a binary incompatibility with the current
+        // Microsoft.Extensions.AI version (WebSearchToolResultContent.Results method not found).
+        // See: https://github.com/microsoft/agent-framework/pull/5515
+        Assert.Skip("Anthropic integration tests temporarily disabled due to SDK incompatibility with Microsoft.Extensions.AI");
+
+        try
+        {
+            _ = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey);
+            _ = TestConfiguration.GetRequiredValue(TestSettings.AnthropicChatModelName);
+            _ = TestConfiguration.GetRequiredValue(TestSettings.AnthropicReasoningModelName);
+        }
+        catch (InvalidOperationException ex)
+        {
+            Assert.Skip("Anthropic configuration could not be loaded. Error:" + ex.Message);
+        }
+
        this._agent = await this.CreateChatClientAgentAsync();
    }

@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.

+using System;
 using System.Threading.Tasks;
 using AgentConformance.IntegrationTests.Support;
 using Anthropic;
@@ -17,19 +18,28 @@ namespace AnthropicChatCompletion.IntegrationTests;
 /// Integration tests for Anthropic Skills functionality.
 /// These tests are designed to be run locally with a valid Anthropic API key.
 /// </summary>
+/// <remarks>
+/// Temporarily disabled due to Anthropic SDK binary incompatibility with
+/// the current Microsoft.Extensions.AI version (WebSearchToolResultContent.Results).
+/// </remarks>
+[Trait("Category", "IntegrationDisabled")]
 public sealed class AnthropicSkillsIntegrationTests
 {
-    // All tests for Anthropic are intended to be ran locally as the CI pipeline for Anthropic is not setup.
-    private const string SkipReason = "Integrations tests for local execution only";
-
    [Fact]
    public async Task CreateAgentWithPptxSkillAsync()
    {
-        Assert.SkipWhen(SkipReason is not null, SkipReason ?? string.Empty);
-
-        // Arrange
-        AnthropicClient anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
-        string model = TestConfiguration.GetRequiredValue(TestSettings.AnthropicChatModelName);
+        AnthropicClient? anthropicClient;
+        string? model;
+        try
+        {
+            anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
+            model = TestConfiguration.GetRequiredValue(TestSettings.AnthropicChatModelName);
+        }
+        catch (InvalidOperationException ex)
+        {
+            Assert.Skip("Anthropic configuration could not be loaded. Error:" + ex.Message);
+            return;
+        }

        BetaSkillParams pptxSkill = new()
        {
@@ -56,10 +66,16 @@ public sealed class AnthropicSkillsIntegrationTests
    [Fact]
    public async Task ListAnthropicManagedSkillsAsync()
    {
-        Assert.SkipWhen(SkipReason is not null, SkipReason ?? string.Empty);
-
-        // Arrange
-        AnthropicClient anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
+        AnthropicClient? anthropicClient;
+        try
+        {
+            anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
+        }
+        catch (InvalidOperationException ex)
+        {
+            Assert.Skip("Anthropic configuration could not be loaded. Error:" + ex.Message);
+            return;
+        }

        // Act
        SkillListPage skills = await anthropicClient.Beta.Skills.List(
@@ -13,8 +13,6 @@ namespace Microsoft.Agents.AI.DurableTask.IntegrationTests;
 [Trait("Category", "SampleValidation")]
 public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper) : SamplesValidationBase(outputHelper)
 {
-    private const string SkipFlakyTimingTest = "Flaky: timing-dependent LLM test, see https://github.com/microsoft/agent-framework/issues/4971";
-
    private static readonly string s_samplesPath = Path.GetFullPath(
        Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "..", "..", "samples", "04-hosting", "DurableAgents", "ConsoleApps"));

@@ -69,7 +67,7 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task SingleAgentOrchestrationChainingSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
@@ -105,7 +103,7 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task MultiAgentConcurrencySampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
@@ -160,7 +158,7 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task MultiAgentConditionalSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
@@ -237,14 +235,14 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
        Assert.True(foundSuccess, "Orchestration did not complete successfully.");
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task SingleAgentOrchestrationHITLSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "05_AgentOrchestration_HITL");

        await this.RunSampleTestAsync(samplePath, async (process, logs) =>
        {
-            using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
+            using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(180));

            // Start the HITL orchestration following the happy path from README
            await this.WriteInputAsync(process, "The Future of Artificial Intelligence", testTimeoutCts.Token);
@@ -260,7 +258,7 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
            while ((line = this.ReadLogLine(logs, testTimeoutCts.Token)) != null)
            {
                // Look for notification that content is ready. The first time we see this, we should send a rejection.
-                // The second time we see this, we should send approval.
+                // Subsequent times we see this, we should send approval (LLM may produce extra review cycles).
                if (line.Contains("Content is ready for review", StringComparison.OrdinalIgnoreCase))
                {
                    if (!rejectionSent)
@@ -275,20 +273,15 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
                            testTimeoutCts.Token);
                        rejectionSent = true;
                    }
-                    else if (!approvalSent)
+                    else
                    {
-                        // Prompt: Approve? (y/n):
+                        // Approve any subsequent draft (LLM non-determinism may produce extra review cycles)
                        await this.WriteInputAsync(process, "y", testTimeoutCts.Token);

                        // Prompt: Feedback (optional):
                        await this.WriteInputAsync(process, "Looks good!", testTimeoutCts.Token);
                        approvalSent = true;
                    }
-                    else
-                    {
-                        // This should never happen
-                        Assert.Fail("Unexpected message found.");
-                    }
                }

                // Look for success message
@@ -311,14 +304,14 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
        });
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task LongRunningToolsSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "06_LongRunningTools");
        await this.RunSampleTestAsync(samplePath, async (process, logs) =>
        {
            // This test takes a bit longer to run due to the multiple agent interactions and the lengthy content generation.
-            using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(90));
+            using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(180));

            // Test starting an agent that schedules a content generation orchestration
            await this.WriteInputAsync(
@@ -335,7 +328,7 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
            while ((line = this.ReadLogLine(logs, testTimeoutCts.Token)) != null)
            {
                // Look for notification that content is ready. The first time we see this, we should send a rejection.
-                // The second time we see this, we should send approval.
+                // Subsequent times we see this, we should send approval (LLM may produce extra review cycles).
                if (line.Contains("NOTIFICATION: Please review the following content for approval", StringComparison.OrdinalIgnoreCase))
                {
                    // Wait for the notification to be fully written to the console
@@ -350,20 +343,15 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
                            testTimeoutCts.Token);
                        rejectionSent = true;
                    }
-                    else if (!approvalSent)
+                    else
                    {
-                        // Approve the content. Note that we need to send a newline character to the console first before sending the input.
+                        // Approve any subsequent draft (LLM non-determinism may produce extra review cycles)
                        await this.WriteInputAsync(
                            process,
                            "\nApprove the content",
                            testTimeoutCts.Token);
                        approvalSent = true;
                    }
-                    else
-                    {
-                        // This should never happen
-                        Assert.Fail("Unexpected message found.");
-                    }
                }

                // Look for success message
@@ -396,14 +384,14 @@ public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper)
        });
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task ReliableStreamingSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "07_ReliableStreaming");
        await this.RunSampleTestAsync(samplePath, async (process, logs) =>
        {
            // This test takes a bit longer to run due to the multiple agent interactions and the lengthy content generation.
-            using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(90));
+            using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(150));

            // Test the agent endpoint with a simple prompt
            await this.WriteInputAsync(process, "Plan a 5-day trip to Seattle. Include daily activities.", testTimeoutCts.Token);
@@ -19,11 +19,9 @@ namespace Microsoft.Agents.AI.DurableTask.IntegrationTests;
 [Trait("Category", "Integration")]
 public sealed class ExternalClientTests(ITestOutputHelper outputHelper) : IDisposable
 {
-    private const string SkipFlakyTimingTest = "Flaky: timing-dependent LLM test, see https://github.com/microsoft/agent-framework/issues/4971";
-
    private static readonly TimeSpan s_defaultTimeout = Debugger.IsAttached
        ? TimeSpan.FromMinutes(5)
-        : TimeSpan.FromSeconds(60);
+        : TimeSpan.FromSeconds(120);

    private static readonly IConfiguration s_configuration =
        new ConfigurationBuilder()
@@ -38,7 +36,7 @@ public sealed class ExternalClientTests(ITestOutputHelper outputHelper) : IDispo

    public void Dispose() => this._cts.Dispose();

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task SimplePromptAsync()
    {
        // Setup
@@ -77,7 +75,7 @@ public sealed class ExternalClientTests(ITestOutputHelper outputHelper) : IDispo
        Assert.Contains(agentLogs, log => log.EventId.Name == "LogAgentResponse");
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task CallFunctionToolsAsync()
    {
        int weatherToolInvocationCount = 0;
@@ -129,7 +127,7 @@ public sealed class ExternalClientTests(ITestOutputHelper outputHelper) : IDispo
        Assert.Equal(1, packingListToolInvocationCount);
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task CallLongRunningFunctionToolsAsync()
    {
        [Description("Starts a greeting workflow and returns the workflow instance ID")]
@@ -217,7 +217,7 @@ public abstract class SamplesValidationBase : IAsyncLifetime
    /// </summary>
    protected CancellationTokenSource CreateTestTimeoutCts(TimeSpan? timeout = null)
    {
-        TimeSpan testTimeout = Debugger.IsAttached ? TimeSpan.FromMinutes(5) : timeout ?? TimeSpan.FromSeconds(60);
+        TimeSpan testTimeout = Debugger.IsAttached ? TimeSpan.FromMinutes(5) : timeout ?? TimeSpan.FromSeconds(120);
        return new CancellationTokenSource(testTimeout);
    }

@@ -22,7 +22,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
    /// <inheritdoc />
    protected override string TaskHubPrefix => "workflow";

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task SequentialWorkflowSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -71,7 +71,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task ConcurrentWorkflowSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -120,7 +120,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task ConditionalEdgesWorkflowSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -182,7 +182,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        }
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task WorkflowEventsSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -278,7 +278,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task WorkflowSharedStateSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -376,7 +376,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task SubWorkflowsSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -452,7 +452,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task WorkflowHITLSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -505,7 +505,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task WorkflowAndAgentsSampleValidationAsync()
    {
        using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -15,8 +15,6 @@ namespace Microsoft.Agents.AI.Hosting.AzureFunctions.IntegrationTests;
 [Trait("Category", "SampleValidation")]
 public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLifetime
 {
-    private const string SkipFlakyTimingTest = "Flaky: timing-dependent LLM test, see https://github.com/microsoft/agent-framework/issues/4971";
-
    private const string AzureFunctionsPort = "7071";
    private const string AzuritePort = "10000";
    private const string DtsPort = "8080";
@@ -37,7 +35,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
            .Build();

    private static bool s_infrastructureStarted;
-    private static readonly TimeSpan s_orchestrationTimeout = TimeSpan.FromMinutes(2);
+    private static readonly TimeSpan s_orchestrationTimeout = TimeSpan.FromMinutes(3);

    // In CI, `dotnet run` builds the Functions project from scratch before the host starts, so 60s is not enough.
    private static readonly TimeSpan s_functionsReadyTimeout = TimeSpan.FromSeconds(180);
@@ -62,7 +60,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        await Task.CompletedTask;
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task SingleAgentSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "01_SingleAgent");
@@ -107,7 +105,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        });
    }

-    [Fact]
+    [Fact(Skip = "Flaky: LLM non-determinism can produce null orchestration results")]
    public async Task SingleAgentOrchestrationChainingSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "02_AgentOrchestration_Chaining");
@@ -150,7 +148,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task MultiAgentOrchestrationConcurrentSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "03_AgentOrchestration_Concurrency");
@@ -200,7 +198,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task MultiAgentOrchestrationConditionalsSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "04_AgentOrchestration_Conditionals");
@@ -218,7 +216,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task SingleAgentOrchestrationHITLSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "05_AgentOrchestration_HITL");
@@ -274,7 +272,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        });
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task LongRunningToolsSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "06_LongRunningTools");
@@ -316,7 +314,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
                    }
                },
                message: "Orchestration is requesting human feedback",
-                timeout: TimeSpan.FromSeconds(60));
+                timeout: TimeSpan.FromSeconds(180));

            // Approve the content
            Uri approvalUri = new($"{runAgentUri}?thread_id={sessionId}");
@@ -336,7 +334,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
                    }
                },
                message: "Content published notification is logged",
-                timeout: TimeSpan.FromSeconds(60));
+                timeout: TimeSpan.FromSeconds(180));

            // Verify the final orchestration status by asking the agent for the status
            Uri statusUri = new($"{runAgentUri}?thread_id={sessionId}");
@@ -360,11 +358,11 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
                    return isCompleted && hasContent;
                },
                message: "Orchestration is completed",
-                timeout: TimeSpan.FromSeconds(60));
+                timeout: TimeSpan.FromSeconds(180));
        });
    }

-    [Fact]
+    [RetryFact(2, 5000)]
    public async Task AgentAsMcpToolAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "07_AgentAsMcpTool");
@@ -404,7 +402,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
        });
    }

-    [Fact(Skip = SkipFlakyTimingTest)]
+    [RetryFact(2, 5000)]
    public async Task ReliableStreamingSampleValidationAsync()
    {
        string samplePath = Path.Combine(s_samplesPath, "08_ReliableStreaming");
@@ -2,16 +2,18 @@

 """Aggregate per-provider JUnit XML test results and generate a trend report.

-Parses ``pytest.xml`` (JUnit XML) files produced by each CI job, merges them
-into a single run, combines with historical data, and generates a markdown
-trend table — the same pattern used by ``scripts/sample_validation/aggregate.py``.
+Parses JUnit XML files produced by CI jobs — both ``pytest.xml`` (Python) and
+xunit v3 ``*.junit`` (dotnet) — merges them into a single run, combines
+with historical data, and generates a markdown trend table.

 Usage (from CI):
    python aggregate.py <reports-dir> <history-file> <output-file>

-The reports directory is expected to contain subdirectories named
-``test-results-<provider>/`` each containing a ``pytest.xml`` file
-(created by ``actions/download-artifact``).
+The reports directory is expected to contain artifact subdirectories.  Two
+layouts are supported:
+
+- **Python (pytest):**  ``test-results-<provider>/pytest.xml``
+- **Dotnet (xunit):**   ``dotnet-test-results-<tfm>-<os>/*.junit``
 """

 from __future__ import annotations
@@ -46,9 +48,21 @@ def _format_run_label(timestamp: str) -> str:
 def _derive_provider(directory_name: str) -> str:
    """Derive a provider label from a report directory name.

-    ``test-results-openai`` → ``OpenAI``
-    ``test-results-azure-openai`` → ``Azure OpenAI``
+    Handles both Python and dotnet naming conventions:
+    - ``test-results-openai`` → ``OpenAI``
+    - ``test-results-azure-openai`` → ``Azure OpenAI``
+    - ``dotnet-test-results-net10.0-ubuntu-latest`` → ``net10.0 (ubuntu)``
    """
+    # Dotnet convention: dotnet-test-results-<framework>-<os>
+    if directory_name.startswith("dotnet-test-results-"):
+        raw = directory_name.replace("dotnet-test-results-", "")
+        # e.g. "net10.0-ubuntu-latest" → framework="net10.0", os="ubuntu-latest"
+        parts = raw.split("-", 1)
+        framework = parts[0]
+        os_label = parts[1].split("-")[0] if len(parts) > 1 else ""
+        return f"{framework} ({os_label})" if os_label else framework
+
+    # Python convention: test-results-<provider>
    raw = directory_name.replace("test-results-", "")
    known = {
        "openai": "OpenAI",
@@ -102,11 +116,21 @@ def _parse_junit_xml(xml_path: Path) -> list[dict[str, str]]:
        # it appends the class name, e.g.:
        #   "packages.foundry.tests.foundry.test_foundry_embedding_client.TestFoundryEmbeddingIntegration"
        # We want the file-level module: "test_foundry_embedding_client"
+        #
+        # xunit (dotnet) writes classname as the full C# type, e.g.:
+        #   "OpenAIChatCompletion.IntegrationTests.ChatCompletionTests"
+        # We want the project prefix: "OpenAIChatCompletion"
        if classname:
            parts = classname.rsplit(".", 2)
            # If the last segment starts with uppercase it's a class name — take the one before it
            if len(parts) >= 2 and parts[-1][0:1].isupper():
-                module = parts[-2]
+                # For dotnet: if the penultimate part is "IntegrationTests" or "UnitTests",
+                # use the part before that (the project name) instead
+                if parts[-2] in ("IntegrationTests", "UnitTests") and len(parts) >= 3:
+                    # parts[0] may contain dots — take the last segment of it
+                    module = parts[0].rsplit(".", 1)[-1]
+                else:
+                    module = parts[-2]
            else:
                module = parts[-1]
        else:
@@ -148,28 +172,61 @@ def _parse_junit_xml(xml_path: Path) -> list[dict[str, str]]:
 # ---------------------------------------------------------------------------


+def _discover_xml_files(reports_dir: Path) -> list[tuple[str, Path]]:
+    """Discover JUnit XML test result files in artifact subdirectories.
+
+    Handles two directory layouts:
+    - **Python (pytest):** ``test-results-<provider>/pytest.xml``
+    - **Dotnet (xunit):** ``dotnet-test-results-<tfm>-<os>/*.junit``
+
+    Returns:
+        List of ``(directory_name, xml_path)`` tuples.
+    """
+    xml_files: list[tuple[str, Path]] = []
+    if not reports_dir.is_dir():
+        return xml_files
+
+    for subdir in sorted(reports_dir.iterdir()):
+        if not subdir.is_dir():
+            continue
+
+        # Python layout: single pytest.xml per artifact
+        pytest_xml = subdir / "pytest.xml"
+        if pytest_xml.exists():
+            xml_files.append((subdir.name, pytest_xml))
+            continue
+
+        # Dotnet layout: multiple *.junit files per artifact
+        junit_files = sorted(subdir.rglob("*.junit"))
+        for jf in junit_files:
+            xml_files.append((subdir.name, jf))
+
+        # Fallback: any .xml file that looks like JUnit (not .trx, not cobertura)
+        if not junit_files:
+            for xf in sorted(subdir.rglob("*.xml")):
+                if xf.suffix == ".xml" and not xf.name.endswith(".cobertura.xml"):
+                    xml_files.append((subdir.name, xf))
+
+    return xml_files
+
+
 def load_current_run(reports_dir: Path) -> dict[str, Any]:
    """Load per-provider JUnit XML reports from the current CI run and merge.

+    Supports both pytest (Python) and xunit v3 (dotnet) JUnit XML formats.
+
    Args:
-        reports_dir: Directory containing ``test-results-<provider>/`` subdirs.
+        reports_dir: Directory containing artifact subdirectories with XML reports.

    Returns:
        Merged run dict with ``timestamp``, ``summary``, ``results``.
    """
    combined_results: dict[str, dict[str, str]] = {}  # nodeid → {status, provider}

-    # actions/download-artifact creates: reports_dir/test-results-openai/pytest.xml
-    xml_files: list[tuple[str, Path]] = []
-    if reports_dir.is_dir():
-        for subdir in sorted(reports_dir.iterdir()):
-            if subdir.is_dir():
-                xml_file = subdir / "pytest.xml"
-                if xml_file.exists():
-                    xml_files.append((subdir.name, xml_file))
+    xml_files = _discover_xml_files(reports_dir)

    if not xml_files:
-        print(f"Warning: No pytest.xml files found in {reports_dir}")
+        print(f"Warning: No JUnit XML files found in {reports_dir}")
        return {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "summary": {
@@ -181,19 +238,42 @@ def load_current_run(reports_dir: Path) -> dict[str, Any]:
            "results": {},
        }

+    # Dotnet tests always run under multiple frameworks, so we always
+    # qualify their keys with the provider to ensure deterministic,
+    # stable keys across runs regardless of file parse order.
+    is_dotnet = any(d.startswith("dotnet-test-results-") for d, _ in xml_files)
+
    for dir_name, xml_file in xml_files:
        print(f"  Loading: {xml_file}")
        provider = _derive_provider(dir_name)
        tests = _parse_junit_xml(xml_file)
        for test in tests:
-            combined_results[test["nodeid"]] = {
+            raw_id = test["nodeid"]
+            key = f"{provider}::{raw_id}" if is_dotnet else raw_id
+
+            combined_results[key] = {
                "status": test["status"],
                "provider": provider,
                "module": test.get("module", ""),
            }

-    # Build summary counts using mutually exclusive status buckets.
-    # Errors are folded into the failed count for display purposes.
+    # Build per-provider summary counts so the report can show one row per
+    # framework (dotnet) or per provider (Python).
+    provider_counts: dict[str, dict[str, int]] = {}
+    for r in combined_results.values():
+        prov = r.get("provider", "Unknown")
+        if prov not in provider_counts:
+            provider_counts[prov] = {"total": 0, "passed": 0, "failed": 0, "skipped": 0}
+        provider_counts[prov]["total"] += 1
+        st = r["status"]
+        if st == "passed":
+            provider_counts[prov]["passed"] += 1
+        elif st in ("failed", "error"):
+            provider_counts[prov]["failed"] += 1
+        elif st == "skipped":
+            provider_counts[prov]["skipped"] += 1
+
+    # Overall summary (sum across all providers).
    statuses = [r["status"] for r in combined_results.values()]
    summary = {
        "total": len(statuses),
@@ -205,6 +285,7 @@ def load_current_run(reports_dir: Path) -> dict[str, Any]:
    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "summary": summary,
+        "provider_summaries": provider_counts,
        "results": combined_results,
    }

@@ -253,7 +334,29 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
        "",
    ]

-    # --- Overall status table (most recent first) ---
+    # Detect a dotnet report (provider-qualified keys): some provider name starts with "net", e.g. net10.0.
+    is_dotnet = False
+    for run in runs:
+        provider_sums = run.get("provider_summaries", {})
+        if any(p.startswith("net") for p in provider_sums):
+            is_dotnet = True
+            break
+
+    if is_dotnet:
+        _generate_dotnet_report(lines, runs)
+    else:
+        _generate_python_report(lines, runs)
+
+    lines.append("")
+    lines.append("**Legend:** ✅ Passed · ❌ Failed · ⏭️ Skipped · ⚠️ Expected Failure (xfail) · N/A Not available")
+    lines.append("")
+
+    return "\n".join(lines)
+
+
+def _generate_python_report(lines: list[str], runs: list[dict[str, Any]]) -> None:
+    """Generate the original single-table Python report format."""
+    # --- Overall status table ---
    lines.append("## Overall Status (Last 5 Runs)")
    lines.append("")
    lines.append("| Run | Total | ✅ Passed | ❌ Failed | ⏭️ Skipped |")
@@ -276,27 +379,91 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:

    lines.append("")

-    # --- Per-test results table ---
-    lines.append("## Per-Test Results")
-    lines.append("")
+    # --- Single per-test results table ---
+    _generate_per_test_table(lines, runs, "## Per-Test Results")

-    # Collect all test nodeids, providers, and modules across all runs
-    all_tests: dict[str, str] = {}  # nodeid → provider (from most recent run)
-    all_modules: dict[str, str] = {}  # nodeid → module (from most recent run)
+
+def _generate_dotnet_report(lines: list[str], runs: list[dict[str, Any]]) -> None:
+    """Append per-framework sections (net10.0, net472, etc.) to ``lines``: summary table, then per-test table."""
+    # Collect every provider named in any run's "provider_summaries"; sorted so section order is stable
+    all_providers: set[str] = set()
+    for run in runs:
+        all_providers.update(run.get("provider_summaries", {}).keys())
+    providers = sorted(all_providers)
+
+    for provider in providers:
+        lines.append(f"## {provider}")
+        lines.append("")
+
+        # --- Per-provider summary table: one row per run, walking ``runs`` in reverse order ---
+        lines.append("| Run | Total | ✅ Passed | ❌ Failed | ⏭️ Skipped |")
+        lines.append("|-----|-------|-----------|-----------|------------|")
+
+        for run in reversed(runs):
+            ps = run.get("provider_summaries", {}).get(provider, {})  # {} when the run has no entry for this provider
+            total = ps.get("total", 0)
+            label = _format_run_label(run["timestamp"])
+            if total == 0:  # nothing recorded for this provider in this run -> all-N/A row
+                lines.append(f"| {label} | N/A | N/A | N/A | N/A |")
+            else:
+                lines.append(
+                    f"| {label} "
+                    f"| {total} "
+                    f"| {ps.get('passed', 0)}/{total} "
+                    f"| {ps.get('failed', 0)}/{total} "
+                    f"| {ps.get('skipped', 0)}/{total} |"
+                )
+
+        for _ in range(MAX_HISTORY - len(runs)):  # pad with N/A rows when fewer than MAX_HISTORY runs exist
+            lines.append("| N/A | N/A | N/A | N/A | N/A |")
+
+        lines.append("")
+
+        # --- Per-test table filtered to this provider (heading=None: the "## {provider}" line above opens the section) ---
+        _generate_per_test_table(
+            lines, runs,
+            heading=None,
+            provider_filter=provider,
+        )
+
+
+def _generate_per_test_table(
+    lines: list[str],
+    runs: list[dict[str, Any]],
+    heading: str | None = None,
+    provider_filter: str | None = None,
+) -> None:
+    """Emit a per-test trend table, optionally filtered to a single provider."""
+    if heading:
+        lines.append(heading)
+        lines.append("")
+
+    # Collect all test nodeids (and metadata) across all runs
+    all_tests: dict[str, str] = {}  # nodeid → provider
+    all_modules: dict[str, str] = {}  # nodeid → module
    for run in runs:
        for nodeid, info in run.get("results", {}).items():
-            provider = info.get("provider", "Unknown") if isinstance(info, dict) else "Unknown"
-            module = info.get("module", "") if isinstance(info, dict) else ""
-            all_tests[nodeid] = provider
+            if not isinstance(info, dict):
+                continue
+            prov = info.get("provider", "Unknown")
+            if provider_filter and prov != provider_filter:
+                continue
+            module = info.get("module", "")
+            all_tests[nodeid] = prov
            all_modules[nodeid] = module

    if not all_tests:
        lines.append("*No test results available.*")
-        return "\n".join(lines)
+        lines.append("")
+        return

-    # Build header (most recent run first)
-    header = "| Test | File | Provider |"
-    separator = "|------|------|----------|"
+    # Build header
+    if provider_filter:
+        header = "| Test | File |"
+        separator = "|------|------|"
+    else:
+        header = "| Test | File | Provider |"
+        separator = "|------|------|----------|"
    for run in reversed(runs):
        label = _format_run_label(run["timestamp"])
        header += f" {label} |"
@@ -308,12 +475,15 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
    lines.append(header)
    lines.append(separator)

-    # Sort by provider then test name
-    for nodeid in sorted(all_tests, key=lambda n: (all_tests[n], n)):
-        provider = all_tests[nodeid]
+    # Sort by module then test name
+    for nodeid in sorted(all_tests, key=lambda n: (all_modules.get(n, ""), n)):
        module = all_modules.get(nodeid, "")
        short = _short_name(nodeid)
-        row = f"| `{short}` | `{module}` | {provider} |"
+        if provider_filter:
+            row = f"| `{short}` | `{module}` |"
+        else:
+            provider = all_tests[nodeid]
+            row = f"| `{short}` | `{module}` | {provider} |"

        for run in reversed(runs):
            result = run.get("results", {}).get(nodeid)
@@ -330,10 +500,6 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
        lines.append(row)

    lines.append("")
-    lines.append("**Legend:** ✅ Passed · ❌ Failed · ⏭️ Skipped · ⚠️ Expected Failure (xfail) · N/A Not available")
-    lines.append("")
-
-    return "\n".join(lines)


 # ---------------------------------------------------------------------------