mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
Python: feat(evals): add ground_truth support for similarity evaluator (#5234)
* feat(evals): add ground_truth support for similarity evaluator

  - Include expected_output as ground_truth in Foundry JSONL dataset rows
  - Add ground_truth to item schema and data mapping for similarity evaluator
  - Add expected_output parameter to evaluate_workflow
  - Add similarity Pattern 3 to evaluate_agent and evaluate_workflow samples
  - Add tests for ground_truth in dataset, schema, and evaluate_workflow

* Apply suggestions from code review

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: wrap long line to satisfy ruff E501

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
Unverified
parent
8f17067383
commit
aa582d021d
@@ -1659,6 +1659,7 @@ async def evaluate_workflow(
|
||||
workflow: Workflow,
|
||||
workflow_result: WorkflowRunResult | None = None,
|
||||
queries: str | Sequence[str] | None = None,
|
||||
expected_output: str | Sequence[str] | None = None,
|
||||
evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]],
|
||||
eval_name: str | None = None,
|
||||
include_overall: bool = True,
|
||||
@@ -1683,6 +1684,11 @@ async def evaluate_workflow(
|
||||
workflow: The workflow instance.
|
||||
workflow_result: A completed ``WorkflowRunResult``.
|
||||
queries: Test queries to run through the workflow.
|
||||
expected_output: Ground-truth expected output(s), one per query. A
|
||||
single string is wrapped into a one-element list. When provided,
|
||||
``queries`` is required and must have the same length. Each value is stamped on
|
||||
the corresponding ``EvalItem.expected_output`` for evaluators
|
||||
that compare against a reference answer (e.g. similarity).
|
||||
evaluators: One or more ``Evaluator`` instances.
|
||||
eval_name: Display name for the evaluation.
|
||||
include_overall: Whether to evaluate the workflow's final output.
|
||||
@@ -1720,10 +1726,20 @@ async def evaluate_workflow(
|
||||
# Normalize singular query to list
|
||||
if isinstance(queries, str):
|
||||
queries = [queries]
|
||||
if isinstance(expected_output, str):
|
||||
expected_output = [expected_output]
|
||||
|
||||
if workflow_result is None and queries is None:
|
||||
raise ValueError("Provide either 'workflow_result' or 'queries'.")
|
||||
|
||||
if expected_output is not None and queries is None:
|
||||
raise ValueError(
|
||||
"Provide 'queries' when using 'expected_output';"
|
||||
" 'expected_output' is not supported with 'workflow_result' only."
|
||||
)
|
||||
if expected_output is not None and queries is not None and len(expected_output) != len(queries):
|
||||
raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.")
|
||||
|
||||
if num_repetitions < 1:
|
||||
raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.")
|
||||
|
||||
@@ -1737,7 +1753,7 @@ async def evaluate_workflow(
|
||||
if queries is not None:
|
||||
results_list: list[WRR] = []
|
||||
for _rep in range(num_repetitions):
|
||||
for q in queries:
|
||||
for qi, q in enumerate(queries):
|
||||
result = await workflow.run(q)
|
||||
if not isinstance(result, WRR):
|
||||
raise TypeError(f"Expected WorkflowRunResult from workflow.run(), got {type(result).__name__}.")
|
||||
@@ -1746,6 +1762,8 @@ async def evaluate_workflow(
|
||||
if include_overall:
|
||||
overall_item = _build_overall_item(q, result)
|
||||
if overall_item:
|
||||
if expected_output is not None:
|
||||
overall_item.expected_output = expected_output[qi]
|
||||
overall_items.append(overall_item)
|
||||
else:
|
||||
assert workflow_result is not None # noqa: S101 # nosec B101
|
||||
|
||||
@@ -75,6 +75,11 @@ _TOOL_EVALUATORS: set[str] = {
|
||||
"builtin.tool_call_success",
|
||||
}
|
||||
|
||||
# Evaluators that require a ground_truth / expected_output field.
|
||||
_GROUND_TRUTH_EVALUATORS: set[str] = {
|
||||
"builtin.similarity",
|
||||
}
|
||||
|
||||
_BUILTIN_EVALUATORS: dict[str, str] = {
|
||||
# Agent behavior
|
||||
"intent_resolution": "builtin.intent_resolution",
|
||||
@@ -196,6 +201,8 @@ def _build_testing_criteria(
|
||||
}
|
||||
if qualified == "builtin.groundedness":
|
||||
mapping["context"] = "{{item.context}}"
|
||||
if qualified in _GROUND_TRUTH_EVALUATORS:
|
||||
mapping["ground_truth"] = "{{item.ground_truth}}"
|
||||
if qualified in _TOOL_EVALUATORS:
|
||||
mapping["tool_definitions"] = "{{item.tool_definitions}}"
|
||||
entry["data_mapping"] = mapping
|
||||
@@ -204,7 +211,9 @@ def _build_testing_criteria(
|
||||
return criteria
|
||||
|
||||
|
||||
def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> dict[str, Any]:
|
||||
def _build_item_schema(
|
||||
*, has_context: bool = False, has_tools: bool = False, has_ground_truth: bool = False
|
||||
) -> dict[str, Any]:
|
||||
"""Build the ``item_schema`` for custom JSONL eval definitions."""
|
||||
properties: dict[str, Any] = {
|
||||
"query": {"type": "string"},
|
||||
@@ -214,6 +223,8 @@ def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) ->
|
||||
}
|
||||
if has_context:
|
||||
properties["context"] = {"type": "string"}
|
||||
if has_ground_truth:
|
||||
properties["ground_truth"] = {"type": "string"}
|
||||
if has_tools:
|
||||
properties["tool_definitions"] = {"type": "array"}
|
||||
return {
|
||||
@@ -681,16 +692,21 @@ class FoundryEvals:
|
||||
]
|
||||
if item.context:
|
||||
d["context"] = item.context
|
||||
if item.expected_output is not None:
|
||||
d["ground_truth"] = item.expected_output
|
||||
dicts.append(d)
|
||||
|
||||
has_context = any("context" in d for d in dicts)
|
||||
has_ground_truth = any("ground_truth" in d for d in dicts)
|
||||
has_tools = any("tool_definitions" in d for d in dicts)
|
||||
|
||||
eval_obj = await self._client.evals.create(
|
||||
name=eval_name,
|
||||
data_source_config={ # type: ignore[arg-type] # pyright: ignore[reportArgumentType]
|
||||
"type": "custom",
|
||||
"item_schema": _build_item_schema(has_context=has_context, has_tools=has_tools),
|
||||
"item_schema": _build_item_schema(
|
||||
has_context=has_context, has_ground_truth=has_ground_truth, has_tools=has_tools
|
||||
),
|
||||
"include_sample_schema": True,
|
||||
},
|
||||
testing_criteria=_build_testing_criteria( # type: ignore[arg-type] # pyright: ignore[reportArgumentType]
|
||||
|
||||
@@ -769,6 +769,10 @@ class TestBuildTestingCriteria:
|
||||
assert c["data_mapping"]["query"] == "{{item.query}}", f"{c['name']}"
|
||||
assert c["data_mapping"]["response"] == "{{item.response}}", f"{c['name']}"
|
||||
|
||||
def test_similarity_includes_ground_truth(self) -> None:
|
||||
criteria = _build_testing_criteria(["similarity"], "gpt-4o", include_data_mapping=True)
|
||||
assert criteria[0]["data_mapping"]["ground_truth"] == "{{item.ground_truth}}"
|
||||
|
||||
def test_all_tool_evaluators_include_tool_definitions(self) -> None:
|
||||
tool_evals = [
|
||||
"tool_call_accuracy",
|
||||
@@ -801,6 +805,10 @@ class TestBuildItemSchema:
|
||||
schema = _build_item_schema(has_tools=True)
|
||||
assert "tool_definitions" in schema["properties"]
|
||||
|
||||
def test_with_ground_truth(self) -> None:
|
||||
schema = _build_item_schema(has_ground_truth=True)
|
||||
assert "ground_truth" in schema["properties"]
|
||||
|
||||
def test_with_context_and_tools(self) -> None:
|
||||
schema = _build_item_schema(has_context=True, has_tools=True)
|
||||
assert "context" in schema["properties"]
|
||||
@@ -1015,6 +1023,50 @@ class TestFoundryEvals:
|
||||
assert ds["type"] == "jsonl"
|
||||
assert "tool_definitions" in ds["source"]["content"][0]["item"]
|
||||
|
||||
async def test_evaluate_ground_truth_in_dataset(self) -> None:
|
||||
"""Items with expected_output include ground_truth in the JSONL payload."""
|
||||
mock_client = MagicMock()
|
||||
|
||||
mock_eval = MagicMock()
|
||||
mock_eval.id = "eval_gt"
|
||||
mock_client.evals.create = AsyncMock(return_value=mock_eval)
|
||||
|
||||
mock_run = MagicMock()
|
||||
mock_run.id = "run_gt"
|
||||
mock_client.evals.runs.create = AsyncMock(return_value=mock_run)
|
||||
|
||||
mock_completed = MagicMock()
|
||||
mock_completed.status = "completed"
|
||||
mock_completed.result_counts = _rc(passed=1)
|
||||
mock_completed.report_url = None
|
||||
mock_completed.per_testing_criteria_results = None
|
||||
mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed)
|
||||
|
||||
items = [
|
||||
EvalItem(
|
||||
conversation=[Message("user", ["What is 2+2?"]), Message("assistant", ["4"])],
|
||||
expected_output="4",
|
||||
),
|
||||
]
|
||||
|
||||
fe = FoundryEvals(
|
||||
client=mock_client,
|
||||
model="gpt-4o",
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
)
|
||||
await fe.evaluate(items)
|
||||
|
||||
# Verify ground_truth appears in JSONL data
|
||||
run_call = mock_client.evals.runs.create.call_args
|
||||
ds = run_call.kwargs["data_source"]
|
||||
assert ds["type"] == "jsonl"
|
||||
assert ds["source"]["content"][0]["item"]["ground_truth"] == "4"
|
||||
|
||||
# Verify item_schema includes ground_truth
|
||||
create_call = mock_client.evals.create.call_args
|
||||
schema = create_call.kwargs["data_source_config"]["item_schema"]
|
||||
assert "ground_truth" in schema["properties"]
|
||||
|
||||
async def test_evaluate_image_content_in_dataset(self) -> None:
|
||||
"""Image content in conversations is preserved in the JSONL payload."""
|
||||
mock_client = MagicMock()
|
||||
@@ -1988,6 +2040,102 @@ class TestEvaluateWorkflow:
|
||||
"researcher has tools — should get tool_call_accuracy"
|
||||
)
|
||||
|
||||
async def test_expected_output_stamps_overall_items(self) -> None:
|
||||
"""expected_output is stamped on overall items as ground_truth in the dataset."""
|
||||
mock_oai = self._mock_oai_client()
|
||||
|
||||
aer = _make_agent_exec_response("agent", "Response", ["Query"])
|
||||
final_output = [Message("assistant", ["Final answer"])]
|
||||
|
||||
events = [
|
||||
WorkflowEvent.executor_invoked("agent", "Test query"),
|
||||
WorkflowEvent.executor_completed("agent", [aer]),
|
||||
WorkflowEvent.output("end", final_output),
|
||||
]
|
||||
wf_result = WorkflowRunResult(events, [])
|
||||
|
||||
mock_workflow = MagicMock()
|
||||
mock_workflow.executors = {}
|
||||
mock_workflow.run = AsyncMock(return_value=wf_result)
|
||||
|
||||
results = await evaluate_workflow(
|
||||
workflow=mock_workflow,
|
||||
queries=["Test query"],
|
||||
expected_output=["Expected answer"],
|
||||
evaluators=FoundryEvals(
|
||||
client=mock_oai,
|
||||
model="gpt-4o",
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
),
|
||||
)
|
||||
|
||||
assert results[0].status == "completed"
|
||||
|
||||
# Verify overall eval's dataset includes ground_truth
|
||||
# The overall eval is the last evals.runs.create call
|
||||
calls = mock_oai.evals.runs.create.call_args_list
|
||||
overall_call = calls[-1]
|
||||
ds = overall_call.kwargs["data_source"]
|
||||
overall_item = ds["source"]["content"][0]["item"]
|
||||
assert overall_item["ground_truth"] == "Expected answer"
|
||||
|
||||
async def test_expected_output_with_num_repetitions(self) -> None:
|
||||
"""expected_output is correctly stamped on overall items across multiple repetitions."""
|
||||
mock_oai = self._mock_oai_client()
|
||||
|
||||
aer = _make_agent_exec_response("agent", "Response", ["Query"])
|
||||
final_output = [Message("assistant", ["Final answer"])]
|
||||
|
||||
events = [
|
||||
WorkflowEvent.executor_invoked("agent", "Test query"),
|
||||
WorkflowEvent.executor_completed("agent", [aer]),
|
||||
WorkflowEvent.output("end", final_output),
|
||||
]
|
||||
wf_result = WorkflowRunResult(events, [])
|
||||
|
||||
mock_workflow = MagicMock()
|
||||
mock_workflow.executors = {}
|
||||
mock_workflow.run = AsyncMock(return_value=wf_result)
|
||||
|
||||
results = await evaluate_workflow(
|
||||
workflow=mock_workflow,
|
||||
queries=["Test query"],
|
||||
expected_output=["Expected answer"],
|
||||
evaluators=FoundryEvals(
|
||||
client=mock_oai,
|
||||
model="gpt-4o",
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
),
|
||||
num_repetitions=2,
|
||||
)
|
||||
|
||||
assert results[0].status == "completed"
|
||||
|
||||
# workflow.run should be called twice (once per repetition)
|
||||
assert mock_workflow.run.call_count == 2
|
||||
|
||||
# Verify all overall items have ground_truth stamped
|
||||
calls = mock_oai.evals.runs.create.call_args_list
|
||||
overall_call = calls[-1]
|
||||
ds = overall_call.kwargs["data_source"]
|
||||
items = ds["source"]["content"]
|
||||
assert len(items) == 2
|
||||
for item in items:
|
||||
assert item["item"]["ground_truth"] == "Expected answer"
|
||||
|
||||
async def test_expected_output_length_mismatch_raises(self) -> None:
|
||||
"""Mismatched queries and expected_output lengths raise ValueError."""
|
||||
mock_oai = MagicMock()
|
||||
mock_workflow = MagicMock()
|
||||
|
||||
with pytest.raises(ValueError, match="expected_output"):
|
||||
await evaluate_workflow(
|
||||
workflow=mock_workflow,
|
||||
queries=["q1", "q2"],
|
||||
expected_output=["e1"],
|
||||
evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"),
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# EvalItemResult and EvalScoreResult
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
|
||||
"""Evaluate an agent using Azure AI Foundry's built-in evaluators.
|
||||
|
||||
This sample demonstrates two patterns:
|
||||
This sample demonstrates three patterns:
|
||||
1. evaluate_agent(responses=...) — Evaluate a response you already have.
|
||||
2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call.
|
||||
3. Similarity — Compare agent output against ground-truth reference answers.
|
||||
|
||||
See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation.
|
||||
|
||||
@@ -149,6 +150,41 @@ async def main() -> None:
|
||||
else:
|
||||
print(f"[FAIL] {r.failed} failed")
|
||||
|
||||
# =========================================================================
|
||||
# Pattern 3: Similarity — compare agent output to ground-truth answers
|
||||
# =========================================================================
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Pattern 3: Similarity evaluation with ground truth")
|
||||
print("=" * 60)
|
||||
|
||||
# Similarity requires expected_output — a reference answer per query
|
||||
# that the evaluator compares against the agent's actual response.
|
||||
results = await evaluate_agent(
|
||||
agent=agent,
|
||||
queries=[
|
||||
"What's the weather like in Seattle?",
|
||||
"How much does a flight from Seattle to Paris cost?",
|
||||
],
|
||||
expected_output=[
|
||||
"62°F, cloudy with a chance of rain",
|
||||
"Flights from Seattle to Paris: $450 round-trip",
|
||||
],
|
||||
evaluators=FoundryEvals(
|
||||
client=chat_client,
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
),
|
||||
)
|
||||
|
||||
for r in results:
|
||||
print(f"Status: {r.status}")
|
||||
print(f"Results: {r.passed}/{r.total} passed")
|
||||
print(f"Portal: {r.report_url}")
|
||||
if r.all_passed:
|
||||
print("[PASS] All passed")
|
||||
else:
|
||||
print(f"[FAIL] {r.failed} failed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
|
||||
"""Evaluate a multi-agent workflow using Azure AI Foundry evaluators.
|
||||
|
||||
This sample demonstrates two patterns:
|
||||
This sample demonstrates three patterns:
|
||||
1. Post-hoc: Run the workflow, then evaluate the result you already have.
|
||||
2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you.
|
||||
3. Similarity: Evaluate the workflow's final output against ground-truth reference answers.
|
||||
|
||||
Both patterns return a list of results (one per provider), each with a per-agent
|
||||
Patterns 1 & 2 return a list of results (one per provider), each with a per-agent
|
||||
breakdown in sub_results so you can identify which agent is underperforming.
|
||||
|
||||
Prerequisites:
|
||||
@@ -79,7 +80,6 @@ async def main() -> None:
|
||||
|
||||
# 4. Create the evaluator — provider config goes here, once
|
||||
evals = FoundryEvals(client=client)
|
||||
|
||||
# =========================================================================
|
||||
# Pattern 1: Post-hoc — evaluate a workflow run you already did
|
||||
# =========================================================================
|
||||
@@ -143,6 +143,43 @@ async def main() -> None:
|
||||
if agent_eval.report_url:
|
||||
print(f" Portal: {agent_eval.report_url}")
|
||||
|
||||
# =========================================================================
|
||||
# Pattern 3: Similarity — compare workflow output to ground-truth answers
|
||||
# =========================================================================
|
||||
# Build a fresh workflow to avoid stale session state from Pattern 2.
|
||||
workflow3 = SequentialBuilder(participants=[researcher, planner]).build()
|
||||
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Pattern 3: Similarity evaluation with ground truth")
|
||||
print("=" * 60)
|
||||
|
||||
# Similarity compares the final workflow output against a reference answer,
|
||||
# so per-agent breakdown is disabled — individual agents don't have their
|
||||
# own ground-truth targets.
|
||||
eval_results = await evaluate_workflow(
|
||||
workflow=workflow3,
|
||||
queries=[
|
||||
"Plan a trip from Seattle to Paris",
|
||||
"Plan a trip from London to Tokyo",
|
||||
],
|
||||
expected_output=[
|
||||
"Pack layers and an umbrella for Paris. Flights from Seattle are around $450 round-trip.",
|
||||
"Bring warm clothing for Tokyo in spring. Flights from London are around $500 round-trip.",
|
||||
],
|
||||
evaluators=FoundryEvals(
|
||||
client=client,
|
||||
evaluators=[FoundryEvals.SIMILARITY],
|
||||
),
|
||||
include_per_agent=False,
|
||||
)
|
||||
|
||||
for r in eval_results:
|
||||
print(f"\nOverall: {r.status}")
|
||||
print(f" Passed: {r.passed}/{r.total}")
|
||||
if r.report_url:
|
||||
print(f" Portal: {r.report_url}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -173,4 +210,12 @@ Overall: completed
|
||||
Per-agent breakdown:
|
||||
researcher: 2/2 passed
|
||||
planner: 2/2 passed
|
||||
|
||||
============================================================
|
||||
Pattern 3: Similarity evaluation with ground truth
|
||||
============================================================
|
||||
|
||||
Overall: completed
|
||||
Passed: 2/2
|
||||
Portal: https://ai.azure.com/...
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user