diff --git a/.github/workflows/dotnet-verify-samples.yml b/.github/workflows/dotnet-verify-samples.yml index ad384eb83e..b1c13a275f 100644 --- a/.github/workflows/dotnet-verify-samples.yml +++ b/.github/workflows/dotnet-verify-samples.yml @@ -63,6 +63,11 @@ jobs: tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: Build solution + working-directory: dotnet + shell: bash + run: dotnet build agent-framework-dotnet.slnx -f net10.0 --warnaserror + - name: Run verify-samples id: verify working-directory: dotnet diff --git a/dotnet/.github/skills/verify-samples-tool/SKILL.md b/dotnet/.github/skills/verify-samples-tool/SKILL.md index cbb1b35009..4d4f153bfd 100644 --- a/dotnet/.github/skills/verify-samples-tool/SKILL.md +++ b/dotnet/.github/skills/verify-samples-tool/SKILL.md @@ -9,9 +9,16 @@ The `verify-samples` project (`dotnet/eng/verify-samples/`) is an automated tool ## Running verify-samples +**Important:** By default, samples must be pre-built before running verify-samples. Build the solution first, or pass `--build` to build samples during the run: + ```bash cd dotnet +dotnet build agent-framework-dotnet.slnx -f net10.0 +``` +Then run verify-samples: + +```bash # Run all samples across all categories dotnet run --project eng/verify-samples -- --log results.log --csv results.csv @@ -24,6 +31,10 @@ dotnet run --project eng/verify-samples -- Agent_Step02_StructuredOutput Agent_S # Control parallelism (default 8) dotnet run --project eng/verify-samples -- --parallel 8 --log results.log +# Build samples during run (skips the need for a prior build step) +# This may cause build conflicts as multiple samples are built in parallel, so use with caution +dotnet run --project eng/verify-samples -- --build --log results.log + # Combine options dotnet run --project eng/verify-samples -- --category 03-workflows --parallel 4 --log results.log --csv results.csv --md results.md ``` diff --git a/dotnet/eng/verify-samples/Program.cs b/dotnet/eng/verify-samples/Program.cs index 7f27d37dd5..ebddc4b16b 100644 --- a/dotnet/eng/verify-samples/Program.cs +++ b/dotnet/eng/verify-samples/Program.cs @@ -14,6 +14,9 @@ // dotnet run -- --log results.log # Write sequential log to file // dotnet run -- --csv results.csv # Write CSV summary to file // dotnet run -- --md results.md # Write Markdown summary to file +// dotnet run -- --build # Build samples during run (default: --no-build) +// Note: By default, this tool expects sample build outputs to already exist. +// Pre-build the solution before running, or pass --build to avoid missing build output failures. // // Required environment variables (for AI-powered samples): // AZURE_OPENAI_ENDPOINT @@ -63,7 +66,7 @@ try // Run all samples var reporter = new ConsoleReporter(); var verifier = new SampleVerifier(chatClient); - var orchestrator = new VerificationOrchestrator(verifier, reporter, dotnetRoot, TimeSpan.FromMinutes(3), logWriter); + var orchestrator = new VerificationOrchestrator(verifier, reporter, dotnetRoot, TimeSpan.FromMinutes(3), logWriter, buildSamples: options.BuildSamples); var run = await orchestrator.RunAllAsync(options.Samples, options.MaxParallelism); diff --git a/dotnet/eng/verify-samples/SampleRunner.cs b/dotnet/eng/verify-samples/SampleRunner.cs index 0fabd82262..f8bd3cc0e6 100644 --- a/dotnet/eng/verify-samples/SampleRunner.cs +++ b/dotnet/eng/verify-samples/SampleRunner.cs @@ -20,23 +20,32 @@ internal static class SampleRunner { /// /// Runs dotnet run --framework net10.0 in the given project directory. + /// When is false (the default), --no-build is passed + /// to skip building, assuming the project was pre-built. /// public static Task RunAsync( string projectPath, TimeSpan timeout, + bool build = false, CancellationToken cancellationToken = default) - => RunAsync(projectPath, "run --framework net10.0", timeout, inputs: null, inputDelayMs: 0, cancellationToken: cancellationToken); + => RunAsync(projectPath, DotnetRunArgs(build), timeout, inputs: null, inputDelayMs: 0, cancellationToken: cancellationToken); /// /// Runs dotnet run --framework net10.0 with stdin inputs. + /// When is false (the default), --no-build is passed + /// to skip building, assuming the project was pre-built. /// public static Task RunAsync( string projectPath, TimeSpan timeout, string?[]? inputs, int inputDelayMs = 2000, + bool build = false, CancellationToken cancellationToken = default) - => RunAsync(projectPath, "run --framework net10.0", timeout, inputs, inputDelayMs, cancellationToken); + => RunAsync(projectPath, DotnetRunArgs(build), timeout, inputs, inputDelayMs, cancellationToken); + + private static string DotnetRunArgs(bool build) => + $"run {(build ? "" : "--no-build")} --framework net10.0"; /// /// Runs an arbitrary dotnet command in the given working directory. diff --git a/dotnet/eng/verify-samples/SampleVerifier.cs b/dotnet/eng/verify-samples/SampleVerifier.cs index 9dc17b1769..ae28aa835f 100644 --- a/dotnet/eng/verify-samples/SampleVerifier.cs +++ b/dotnet/eng/verify-samples/SampleVerifier.cs @@ -1,5 +1,6 @@ // Copyright (c) Microsoft. All rights reserved. +using System.ComponentModel; using System.Text.Json.Serialization; using Microsoft.Agents.AI; using Microsoft.Extensions.AI; @@ -27,11 +28,19 @@ internal sealed class SampleVerifier instructions: """ You are a test output verifier. You will be given: 1. The actual stdout output of a program - 2. A list of expectations about what the output should contain or demonstrate + 2. The stderr output (if any) + 3. A list of expectations about what the output should contain or demonstrate Your job is to determine whether the actual output satisfies each expectation. Be reasonable — the output comes from an LLM so exact wording won't match, but the semantic intent should be clearly satisfied. + + In your response, you MUST: + - Always provide ai_reasoning with a brief overall assessment. + - Always provide exactly one entry in expectation_results for each expectation, + in the same order as the input list. + - For each expectation_results entry, echo the expectation text in the expectation + field and explain your assessment in the detail field, citing evidence from the output. """, name: "OutputVerifier"); } @@ -78,7 +87,7 @@ internal sealed class SampleVerifier } else { - var aiResult = await this.VerifyWithAIAsync(run.Stdout, sample.ExpectedOutputDescription); + var aiResult = await this.VerifyWithAIAsync(run.Stdout, run.Stderr, sample.ExpectedOutputDescription); aiReasoning = aiResult.Reasoning; foreach (var unmet in aiResult.UnmetExpectations) @@ -100,16 +109,28 @@ internal sealed class SampleVerifier } private async Task<(string Reasoning, List UnmetExpectations)> VerifyWithAIAsync( - string actualOutput, + string stdout, + string stderr, string[] expectations) { var expectationList = string.Join("\n", expectations.Select((e, i) => $" {i + 1}. {e}")); + + var stderrSection = string.IsNullOrWhiteSpace(stderr) + ? "" + : $""" + + Stderr output: + --- + {Truncate(stderr, 2000)} + --- + """; + var prompt = $""" Actual program output: --- - {Truncate(actualOutput, 4000)} + {Truncate(stdout, 4000)} --- - + {stderrSection} Expectations to verify: {expectationList} @@ -126,7 +147,9 @@ internal sealed class SampleVerifier return ($"AI verification returned null result. Raw: {response.Text}", ["AI verification returned null result."]); } - var reasoning = result.Reasoning ?? "(no reasoning provided)"; + var reasoning = string.IsNullOrWhiteSpace(result.AIReasoning) + ? "(no reasoning provided)" + : result.AIReasoning; // Collect unmet expectations as individual failures var unmet = new List(); @@ -174,12 +197,14 @@ internal sealed class AIVerificationResponse public bool Pass { get; set; } /// Brief explanation of the overall assessment. - [JsonPropertyName("reasoning")] - public string? Reasoning { get; set; } + [JsonPropertyName("ai_reasoning")] + [Description("Always required. Brief explanation of the overall assessment, covering all expectations.")] + public string AIReasoning { get; set; } = string.Empty; /// Per-expectation results. [JsonPropertyName("expectation_results")] - public List? ExpectationResults { get; set; } + [Description("Always required. One entry per expectation, in the same order as the input list.")] + public List ExpectationResults { get; set; } = []; } /// @@ -190,7 +215,8 @@ internal sealed class ExpectationResult { /// The expectation text that was evaluated. [JsonPropertyName("expectation")] - public string? Expectation { get; set; } + [Description("Echo back the expectation text being evaluated.")] + public string Expectation { get; set; } = string.Empty; /// Whether this expectation was met. [JsonPropertyName("met")] @@ -198,5 +224,6 @@ internal sealed class ExpectationResult /// Detail about how the expectation was or was not met. [JsonPropertyName("detail")] - public string? Detail { get; set; } + [Description("Explain how the expectation was or was not met, citing specific evidence from the output.")] + public string Detail { get; set; } = string.Empty; } diff --git a/dotnet/eng/verify-samples/VerificationOrchestrator.cs b/dotnet/eng/verify-samples/VerificationOrchestrator.cs index 1ce805bc5a..b55efc9c14 100644 --- a/dotnet/eng/verify-samples/VerificationOrchestrator.cs +++ b/dotnet/eng/verify-samples/VerificationOrchestrator.cs @@ -14,19 +14,22 @@ internal sealed class VerificationOrchestrator private readonly LogFileWriter? _logWriter; private readonly string _dotnetRoot; private readonly TimeSpan _timeout; + private readonly bool _buildSamples; public VerificationOrchestrator( SampleVerifier verifier, ConsoleReporter reporter, string dotnetRoot, TimeSpan timeout, - LogFileWriter? logWriter = null) + LogFileWriter? logWriter = null, + bool buildSamples = false) { this._verifier = verifier; this._reporter = reporter; this._logWriter = logWriter; this._dotnetRoot = dotnetRoot; this._timeout = timeout; + this._buildSamples = buildSamples; } /// @@ -136,8 +139,8 @@ internal sealed class VerificationOrchestrator var projectPath = Path.Combine(this._dotnetRoot, sample.ProjectPath); var run = sample.Inputs.Length > 0 - ? await SampleRunner.RunAsync(projectPath, this._timeout, sample.Inputs, sample.InputDelayMs) - : await SampleRunner.RunAsync(projectPath, this._timeout); + ? await SampleRunner.RunAsync(projectPath, this._timeout, sample.Inputs, sample.InputDelayMs, build: this._buildSamples) + : await SampleRunner.RunAsync(projectPath, this._timeout, build: this._buildSamples); log.Add($"[{sample.Name}] Completed ({run.Elapsed.TotalSeconds:F1}s, exit={run.ExitCode})"); this._reporter.WriteLineWithPrefix( diff --git a/dotnet/eng/verify-samples/VerifyOptions.cs b/dotnet/eng/verify-samples/VerifyOptions.cs index 78ba38acf1..95e0af8795 100644 --- a/dotnet/eng/verify-samples/VerifyOptions.cs +++ b/dotnet/eng/verify-samples/VerifyOptions.cs @@ -27,6 +27,12 @@ internal sealed class VerifyOptions /// public string? LogFilePath { get; init; } + /// + /// When true, samples are built as part of dotnet run. + /// When false (the default), --no-build is passed, assuming a prior build step. + /// + public bool BuildSamples { get; init; } + /// /// The filtered list of samples to process. /// @@ -55,6 +61,7 @@ internal sealed class VerifyOptions var logFilePath = ExtractArg(argList, "--log"); var csvFilePath = ExtractArg(argList, "--csv"); var markdownFilePath = ExtractArg(argList, "--md"); + var buildSamples = ExtractFlag(argList, "--build"); int maxParallelism = 8; var parallelArg = ExtractArg(argList, "--parallel"); @@ -105,6 +112,7 @@ internal sealed class VerifyOptions LogFilePath = logFilePath, CsvFilePath = csvFilePath, MarkdownFilePath = markdownFilePath, + BuildSamples = buildSamples, Samples = samples, }; } @@ -128,4 +136,16 @@ internal sealed class VerifyOptions list.RemoveRange(idx, 2); return value; } + + private static bool ExtractFlag(List list, string flag) + { + var idx = list.IndexOf(flag); + if (idx < 0) + { + return false; + } + + list.RemoveAt(idx); + return true; + } }