mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
Python: include tool definitions for Foundry agent evals (#5974)
This commit is contained in:
committed by
GitHub
Unverified
parent
289cafcf36
commit
46ed66cfd5
@@ -75,6 +75,15 @@ _TOOL_EVALUATORS: set[str] = {
|
||||
"builtin.tool_call_success",
|
||||
}
|
||||
|
||||
# Evaluators whose data mapping may carry ``tool_definitions`` when the
# evaluated items include tools: every entry of ``_TOOL_EVALUATORS`` plus
# the additional evaluators named below.
_TOOL_DEFINITION_EVALUATORS: set[str] = {
    *_TOOL_EVALUATORS,
    "builtin.intent_resolution",
    "builtin.task_adherence",
    "builtin.task_completion",
    "builtin.task_navigation_efficiency",
}
|
||||
|
||||
# Evaluators that require a ground_truth / expected_output field.
|
||||
_GROUND_TRUTH_EVALUATORS: set[str] = {
|
||||
"builtin.similarity",
|
||||
@@ -161,6 +170,7 @@ def _build_testing_criteria(
|
||||
model: str,
|
||||
*,
|
||||
include_data_mapping: bool = False,
|
||||
include_tool_definitions: bool = False,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build ``testing_criteria`` for ``evals.create()``.
|
||||
|
||||
@@ -169,6 +179,8 @@ def _build_testing_criteria(
|
||||
model: Model deployment for the LLM judge.
|
||||
include_data_mapping: Whether to include field-level data mapping
|
||||
(required for the JSONL data source, not needed for response-based).
|
||||
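include_tool_definitions: Whether the mapped data items include tool
definitions. When true, evaluators that accept tool definitions also
get a ``tool_definitions`` entry in their data mapping.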
|
||||
"""
|
||||
criteria: list[dict[str, Any]] = []
|
||||
for name in evaluators:
|
||||
@@ -203,7 +215,7 @@ def _build_testing_criteria(
|
||||
mapping["context"] = "{{item.context}}"
|
||||
if qualified in _GROUND_TRUTH_EVALUATORS:
|
||||
mapping["ground_truth"] = "{{item.ground_truth}}"
|
||||
if qualified in _TOOL_EVALUATORS:
|
||||
if include_tool_definitions and qualified in _TOOL_DEFINITION_EVALUATORS:
|
||||
mapping["tool_definitions"] = "{{item.tool_definitions}}"
|
||||
entry["data_mapping"] = mapping
|
||||
|
||||
@@ -713,6 +725,7 @@ class FoundryEvals:
|
||||
evaluators,
|
||||
self._model,
|
||||
include_data_mapping=True,
|
||||
include_tool_definitions=has_tools,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -745,7 +745,12 @@ class TestBuildTestingCriteria:
|
||||
assert "conversation" not in criteria[1]["data_mapping"]
|
||||
|
||||
def test_tool_evaluator_includes_tool_definitions(self) -> None:
|
||||
criteria = _build_testing_criteria(["relevance", "tool_call_accuracy"], "gpt-4o", include_data_mapping=True)
|
||||
criteria = _build_testing_criteria(
|
||||
["relevance", "tool_call_accuracy"],
|
||||
"gpt-4o",
|
||||
include_data_mapping=True,
|
||||
include_tool_definitions=True,
|
||||
)
|
||||
# relevance: string query/response
|
||||
assert criteria[0]["data_mapping"]["query"] == "{{item.query}}"
|
||||
assert criteria[0]["data_mapping"]["response"] == "{{item.response}}"
|
||||
@@ -762,6 +767,17 @@ class TestBuildTestingCriteria:
|
||||
assert c["data_mapping"]["query"] == "{{item.query_messages}}", f"{c['name']}"
|
||||
assert c["data_mapping"]["response"] == "{{item.response_messages}}", f"{c['name']}"
|
||||
|
||||
def test_agent_evaluators_include_tool_definitions_when_tools_present(self) -> None:
    """Each listed evaluator maps ``tool_definitions`` when the flag is enabled."""
    expected_mapping = "{{item.tool_definitions}}"
    evaluator_names = [
        "task_adherence",
        "intent_resolution",
        "task_completion",
        "task_navigation_efficiency",
    ]
    built = _build_testing_criteria(
        evaluator_names,
        "gpt-4o",
        include_data_mapping=True,
        include_tool_definitions=True,
    )
    # Every generated criterion must point tool_definitions at the item field.
    for entry in built:
        assert entry["data_mapping"]["tool_definitions"] == expected_mapping, f"{entry['name']}"
|
||||
|
||||
def test_quality_evaluators_use_strings(self) -> None:
|
||||
quality_evals = ["coherence", "relevance", "fluency"]
|
||||
criteria = _build_testing_criteria(quality_evals, "gpt-4o", include_data_mapping=True)
|
||||
@@ -781,7 +797,12 @@ class TestBuildTestingCriteria:
|
||||
"tool_output_utilization",
|
||||
"tool_call_success",
|
||||
]
|
||||
criteria = _build_testing_criteria(tool_evals, "gpt-4o", include_data_mapping=True)
|
||||
criteria = _build_testing_criteria(
|
||||
tool_evals,
|
||||
"gpt-4o",
|
||||
include_data_mapping=True,
|
||||
include_tool_definitions=True,
|
||||
)
|
||||
for c in criteria:
|
||||
assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user