Python: include tool definitions for Foundry agent evals (#5974)

This commit is contained in:
Yufeng He
2026-05-22 00:23:36 +08:00
committed by GitHub
Unverified
parent 289cafcf36
commit 46ed66cfd5
2 changed files with 37 additions and 3 deletions
@@ -75,6 +75,15 @@ _TOOL_EVALUATORS: set[str] = {
"builtin.tool_call_success",
}
# Evaluators that accept a ``tool_definitions`` entry in their data mapping
# when the evaluated items include tools. This is every evaluator in
# _TOOL_EVALUATORS plus the four evaluators listed below.
# _build_testing_criteria only emits the mapping for these names when it is
# called with include_tool_definitions=True.
_TOOL_DEFINITION_EVALUATORS: set[str] = _TOOL_EVALUATORS | {
"builtin.intent_resolution",
"builtin.task_adherence",
"builtin.task_completion",
"builtin.task_navigation_efficiency",
}
# Evaluators that require a ground_truth / expected_output field.
_GROUND_TRUTH_EVALUATORS: set[str] = {
"builtin.similarity",
@@ -161,6 +170,7 @@ def _build_testing_criteria(
model: str,
*,
include_data_mapping: bool = False,
include_tool_definitions: bool = False,
) -> list[dict[str, Any]]:
"""Build ``testing_criteria`` for ``evals.create()``.
@@ -169,6 +179,8 @@ def _build_testing_criteria(
model: Model deployment for the LLM judge.
include_data_mapping: Whether to include field-level data mapping
(required for the JSONL data source, not needed for response-based).
include_tool_definitions: Whether the mapped data items include tool
definitions.
"""
criteria: list[dict[str, Any]] = []
for name in evaluators:
@@ -203,7 +215,7 @@ def _build_testing_criteria(
mapping["context"] = "{{item.context}}"
if qualified in _GROUND_TRUTH_EVALUATORS:
mapping["ground_truth"] = "{{item.ground_truth}}"
if qualified in _TOOL_EVALUATORS:
if include_tool_definitions and qualified in _TOOL_DEFINITION_EVALUATORS:
mapping["tool_definitions"] = "{{item.tool_definitions}}"
entry["data_mapping"] = mapping
@@ -713,6 +725,7 @@ class FoundryEvals:
evaluators,
self._model,
include_data_mapping=True,
include_tool_definitions=has_tools,
),
)
@@ -745,7 +745,12 @@ class TestBuildTestingCriteria:
assert "conversation" not in criteria[1]["data_mapping"]
def test_tool_evaluator_includes_tool_definitions(self) -> None:
criteria = _build_testing_criteria(["relevance", "tool_call_accuracy"], "gpt-4o", include_data_mapping=True)
criteria = _build_testing_criteria(
["relevance", "tool_call_accuracy"],
"gpt-4o",
include_data_mapping=True,
include_tool_definitions=True,
)
# relevance: string query/response
assert criteria[0]["data_mapping"]["query"] == "{{item.query}}"
assert criteria[0]["data_mapping"]["response"] == "{{item.response}}"
@@ -762,6 +767,17 @@ class TestBuildTestingCriteria:
assert c["data_mapping"]["query"] == "{{item.query_messages}}", f"{c['name']}"
assert c["data_mapping"]["response"] == "{{item.response_messages}}", f"{c['name']}"
# Checks that each of the four agent evaluators below gets a
# ``tool_definitions`` data mapping when the caller opts in.
def test_agent_evaluators_include_tool_definitions_when_tools_present(self) -> None:
agent_evals = ["task_adherence", "intent_resolution", "task_completion", "task_navigation_efficiency"]
criteria = _build_testing_criteria(
agent_evals,
"gpt-4o",
include_data_mapping=True,
# Opt in explicitly: include_tool_definitions defaults to False.
include_tool_definitions=True,
)
for c in criteria:
# The evaluator name is the assertion message, so a mismatch identifies
# which evaluator failed. A missing key raises KeyError instead.
assert c["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}", f"{c['name']}"
def test_quality_evaluators_use_strings(self) -> None:
quality_evals = ["coherence", "relevance", "fluency"]
criteria = _build_testing_criteria(quality_evals, "gpt-4o", include_data_mapping=True)
@@ -781,7 +797,12 @@ class TestBuildTestingCriteria:
"tool_output_utilization",
"tool_call_success",
]
criteria = _build_testing_criteria(tool_evals, "gpt-4o", include_data_mapping=True)
criteria = _build_testing_criteria(
tool_evals,
"gpt-4o",
include_data_mapping=True,
include_tool_definitions=True,
)
for c in criteria:
assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"