mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
6d6cb840ae
* Improve resilience of verify-samples by building separately and improving evaluation instructions * Address PR comments * Address PR comment
230 lines
8.6 KiB
C#
230 lines
8.6 KiB
C#
// Copyright (c) Microsoft. All rights reserved.
|
|
|
|
using System.ComponentModel;
|
|
using System.Text.Json.Serialization;
|
|
using Microsoft.Agents.AI;
|
|
using Microsoft.Extensions.AI;
|
|
using OpenAI.Chat;
|
|
|
|
namespace VerifySamples;
|
|
|
|
/// <summary>
/// Verifies sample output using deterministic checks and an AI agent
/// for non-deterministic output validation.
/// </summary>
internal sealed class SampleVerifier
{
    // Null when no chat client was supplied; AI verification is then unavailable.
    private readonly AIAgent? _verifierAgent;

    /// <summary>
    /// Creates a verifier. If <paramref name="chatClient"/> is provided,
    /// AI-based verification is available for non-deterministic samples.
    /// </summary>
    /// <param name="chatClient">
    /// Chat client used to build the "OutputVerifier" agent, or <see langword="null"/> to
    /// create a verifier that can only run the deterministic checks.
    /// </param>
    public SampleVerifier(ChatClient? chatClient = null)
    {
        if (chatClient is not null)
        {
            this._verifierAgent = chatClient.AsAIAgent(
                instructions: """
                    You are a test output verifier. You will be given:
                    1. The actual stdout output of a program
                    2. The stderr output (if any)
                    3. A list of expectations about what the output should contain or demonstrate

                    Your job is to determine whether the actual output satisfies each expectation.
                    Be reasonable — the output comes from an LLM so exact wording won't match, but the
                    semantic intent should be clearly satisfied.

                    In your response, you MUST:
                    - Always provide ai_reasoning with a brief overall assessment.
                    - Always provide exactly one entry in expectation_results for each expectation,
                    in the same order as the input list.
                    - For each expectation_results entry, echo the expectation text in the expectation
                    field and explain your assessment in the detail field, citing evidence from the output.
                    """,
                name: "OutputVerifier");
        }
    }

    /// <summary>
    /// Verifies the output of a sample run against its definition.
    /// </summary>
    /// <param name="sample">The sample definition supplying the expected substrings and expectations.</param>
    /// <param name="run">The captured exit code, stdout and stderr of the sample run.</param>
    /// <returns>
    /// A result that passes only when no check recorded a failure. Every failed check
    /// contributes one entry to the result's failure list.
    /// </returns>
    public async Task<VerificationResult> VerifyAsync(SampleDefinition sample, SampleRunResult run)
    {
        var failures = new List<string>();

        // 1. Exit code check
        if (run.ExitCode != 0)
        {
            failures.Add($"Exit code was {run.ExitCode}, expected 0. Stderr: {Truncate(run.Stderr, 500)}");
        }

        // 2. Must-contain checks (ordinal, case-sensitive, against the full stdout)
        foreach (var expected in sample.MustContain)
        {
            if (!run.Stdout.Contains(expected, StringComparison.Ordinal))
            {
                failures.Add($"Output missing expected substring: \"{expected}\"");
            }
        }

        // 3. Must-not-contain checks (ordinal, case-sensitive, against the full stdout)
        foreach (var unexpected in sample.MustNotContain)
        {
            if (run.Stdout.Contains(unexpected, StringComparison.Ordinal))
            {
                failures.Add($"Output contains unexpected substring: \"{unexpected}\"");
            }
        }

        // 4. AI verification for non-deterministic samples
        string? aiReasoning = null;
        if (!sample.IsDeterministic && sample.ExpectedOutputDescription.Length > 0)
        {
            if (this._verifierAgent is null)
            {
                // Expectations exist but cannot be evaluated: report a failure instead of skipping them.
                failures.Add("AI verification required but no AI agent configured (missing AZURE_OPENAI_ENDPOINT).");
            }
            else
            {
                var aiResult = await this.VerifyWithAIAsync(run.Stdout, run.Stderr, sample.ExpectedOutputDescription);
                aiReasoning = aiResult.Reasoning;

                foreach (var unmet in aiResult.UnmetExpectations)
                {
                    failures.Add($"AI expectation not met: {unmet}");
                }
            }
        }

        bool passed = failures.Count == 0;
        return new VerificationResult
        {
            SampleName = sample.Name,
            Passed = passed,
            Summary = passed ? "All checks passed" : $"{failures.Count} check(s) failed",
            Failures = failures,
            AIReasoning = aiReasoning,
        };
    }

    /// <summary>
    /// Asks the verifier agent whether the captured output satisfies the given expectations.
    /// </summary>
    /// <param name="stdout">Standard output of the run; only the first 4000 characters are sent.</param>
    /// <param name="stderr">Standard error of the run; omitted when blank, otherwise the first 2000 characters are sent.</param>
    /// <param name="expectations">The expectations to evaluate, sent as a numbered list.</param>
    /// <returns>
    /// The agent's overall reasoning and one entry per unmet expectation. An exception during
    /// the agent call, or a null structured result, is reported as a single unmet entry
    /// instead of being thrown.
    /// </returns>
    private async Task<(string Reasoning, List<string> UnmetExpectations)> VerifyWithAIAsync(
        string stdout,
        string stderr,
        string[] expectations)
    {
        var expectationList = string.Join("\n", expectations.Select((e, i) => $" {i + 1}. {e}"));

        var stderrSection = string.IsNullOrWhiteSpace(stderr)
            ? ""
            : $"""

                Stderr output:
                ---
                {Truncate(stderr, 2000)}
                ---
                """;

        var prompt = $"""
            Actual program output:
            ---
            {Truncate(stdout, 4000)}
            ---
            {stderrSection}
            Expectations to verify:
            {expectationList}

            Does the output satisfy all expectations?
            """;

        try
        {
            // Only called from VerifyAsync after it has confirmed the agent is not null.
            var response = await this._verifierAgent!.RunAsync<AIVerificationResponse>(prompt);
            var result = response.Result;

            if (result is null)
            {
                return ($"AI verification returned null result. Raw: {response.Text}", ["AI verification returned null result."]);
            }

            var reasoning = string.IsNullOrWhiteSpace(result.AIReasoning)
                ? "(no reasoning provided)"
                : result.AIReasoning;

            // Collect unmet expectations as individual failures
            var unmet = new List<string>();
            if (result.ExpectationResults is { Count: > 0 })
            {
                foreach (var er in result.ExpectationResults.Where(r => !r.Met))
                {
                    // The model may leave the echoed expectation blank; use a placeholder so the
                    // failure line is never empty and never starts with a dangling separator.
                    var label = string.IsNullOrWhiteSpace(er.Expectation) ? "Unknown expectation" : er.Expectation;
                    unmet.Add(string.IsNullOrWhiteSpace(er.Detail) ? label : $"{label} — {er.Detail}");
                }

                // If the model flagged overall failure but all individual expectations were met,
                // still treat as failure using the overall reasoning.
                if (unmet.Count == 0 && !result.Pass)
                {
                    unmet.Add(reasoning);
                }
            }
            else if (!result.Pass)
            {
                // Fallback: no per-expectation detail but overall pass is false
                unmet.Add(reasoning);
            }

            return (reasoning, unmet);
        }
        catch (Exception ex)
        {
            // Surface the error as a verification failure rather than letting it propagate.
            return ($"AI verification error: {ex.Message}", [$"AI verification error: {ex.Message}"]);
        }
    }

    /// <summary>
    /// Shortens <paramref name="text"/> to at most <paramref name="maxLength"/> UTF-16 code units
    /// and appends "... (truncated)" when anything was removed.
    /// </summary>
    /// <param name="text">The text to shorten.</param>
    /// <param name="maxLength">Maximum number of characters of <paramref name="text"/> to keep.</param>
    /// <returns>The original text when it already fits; otherwise the shortened text plus the marker.</returns>
    private static string Truncate(string text, int maxLength)
    {
        if (text.Length <= maxLength)
        {
            return text;
        }

        // Do not cut between the two halves of a surrogate pair: a lone high surrogate is
        // not valid text, so drop it and keep one character fewer.
        int cut = maxLength;
        if (cut > 0 && char.IsHighSurrogate(text[cut - 1]))
        {
            cut--;
        }

        return text[..cut] + "... (truncated)";
    }
}
|
|
|
|
/// <summary>
/// Structured response from the AI verification agent.
/// </summary>
/// <remarks>
/// The <see cref="DescriptionAttribute"/> text on each property is guidance addressed to the
/// model that fills in this type, so every property carries one.
/// </remarks>
[System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1812:Avoid uninstantiated internal classes", Justification = "Instantiated by JSON deserialization via RunAsync<T>.")]
internal sealed class AIVerificationResponse
{
    /// <summary>Whether all expectations were met. Defaults to <see langword="false"/>.</summary>
    [JsonPropertyName("pass")]
    [Description("Always required. True only when every expectation was met; false if any expectation was not met.")]
    public bool Pass { get; set; }

    /// <summary>Brief explanation of the overall assessment.</summary>
    [JsonPropertyName("ai_reasoning")]
    [Description("Always required. Brief explanation of the overall assessment, covering all expectations.")]
    public string AIReasoning { get; set; } = string.Empty;

    /// <summary>Per-expectation results.</summary>
    [JsonPropertyName("expectation_results")]
    [Description("Always required. One entry per expectation, in the same order as the input list.")]
    public List<ExpectationResult> ExpectationResults { get; set; } = [];
}
|
|
|
|
/// <summary>
/// Outcome of checking one expectation against the program output.
/// </summary>
[System.Diagnostics.CodeAnalysis.SuppressMessage(
    "Performance",
    "CA1812:Avoid uninstantiated internal classes",
    Justification = "Instantiated by JSON deserialization via RunAsync<T>.")]
internal sealed class ExpectationResult
{
    /// <summary>
    /// Text of the expectation this entry refers to. Empty by default.
    /// </summary>
    [JsonPropertyName("expectation"), Description("Echo back the expectation text being evaluated.")]
    public string Expectation { get; set; } = "";

    /// <summary>
    /// <see langword="true"/> when the expectation was satisfied; <see langword="false"/> by default.
    /// </summary>
    [JsonPropertyName("met")]
    public bool Met { get; set; }

    /// <summary>
    /// Explanation of why the expectation was or was not satisfied. Empty by default.
    /// </summary>
    [JsonPropertyName("detail"), Description("Explain how the expectation was or was not met, citing specific evidence from the output.")]
    public string Detail { get; set; } = "";
}
|