diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py index a2da8939cf70..cb1bcb708860 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py @@ -5,7 +5,7 @@ Validator for conversation-style query and response inputs. """ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional from typing_extensions import override from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget from ._validation_constants import MessageRole, ContentType @@ -18,13 +18,34 @@ class ConversationValidator(ValidatorInterface): """ requires_query: bool = True + check_for_unsupported_tools: bool = False error_target: ErrorTarget - def __init__(self, error_target: ErrorTarget, requires_query: bool = True): + UNSUPPORTED_TOOLS: List[str] = [ + "web_search_call", "code_interpreter_call", + "azure_ai_search_call", "bing_grounding_call", + "bing_custom_search_preview_call", "azure_fabric", + "sharepoint_grounding", "browser_automation", + "openapi_call" + ] + + def __init__(self, error_target: ErrorTarget, requires_query: bool = True, check_for_unsupported_tools: bool = False): """Initialize with error target and query requirement.""" self.requires_query = requires_query + self.check_for_unsupported_tools = check_for_unsupported_tools self.error_target = error_target + def _validate_field_exists(self, item: Dict[str, Any], field_name: str, context: str) -> Optional[EvaluationException]: + """Validate that a field exists in a dictionary.""" + if field_name not in item: + return EvaluationException( + message=f"Each {context} must contain a '{field_name}' field.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + return None + def _validate_string_field( self, item: Dict[str, Any], field_name: str, context: str ) -> Optional[EvaluationException]: @@ -49,7 +70,7 @@ def _validate_string_field( def _validate_list_field( self, item: Dict[str, Any], field_name: str, context: str ) -> Optional[EvaluationException]: - """Validate that a field exists and is a dictionary.""" + """Validate that a field exists and is a list.""" if field_name not in item: return EvaluationException( message=f"Each {context} must contain a '{field_name}' field.", @@ -109,14 +130,19 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) - def _validate_tool_call_content_item(self, content_item: Dict[str, Any]) -> Optional[EvaluationException]: """Validate a tool_call content item.""" - if "type" not in content_item or content_item["type"] != ContentType.TOOL_CALL: + valid_tool_call_content_types = [ContentType.TOOL_CALL, ContentType.FUNCTION_CALL, ContentType.OPENAPI_CALL, ContentType.MCP_APPROVAL_REQUEST] + valid_tool_call_content_types_as_strings = [t.value for t in valid_tool_call_content_types] + if "type" not in content_item or content_item["type"] not in valid_tool_call_content_types: return EvaluationException( - message=f"The content item must be of type '{ContentType.TOOL_CALL.value}' in tool_call content item.", + message=f"The content item must be of type {valid_tool_call_content_types_as_strings} in tool_call content item.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) + if content_item["type"] == ContentType.MCP_APPROVAL_REQUEST: + return None + error = self._validate_string_field(content_item, "name", "tool_call content items") if error: return error @@ -156,13 +182,14 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu """Validate assistant message content.""" content = message["content"] + valid_assistant_content_types = [ContentType.TEXT, ContentType.OUTPUT_TEXT, ContentType.TOOL_CALL, ContentType.FUNCTION_CALL, ContentType.MCP_APPROVAL_REQUEST, ContentType.OPENAPI_CALL] + valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] if isinstance(content, list): for content_item in content: content_type = content_item["type"] - valid_assistant_content_types = [ContentType.TEXT, ContentType.OUTPUT_TEXT, ContentType.TOOL_CALL] if content_type not in valid_assistant_content_types: return EvaluationException( - message=f"Invalid content type '{content_type}' for message with role '{MessageRole.ASSISTANT.value}'. Must be one of {[t.value for t in valid_assistant_content_types]}.", + message=f"Invalid content type '{content_type}' for message with role '{MessageRole.ASSISTANT.value}'. Must be one of {valid_assistant_content_types_as_strings}.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -172,10 +199,22 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu error = self._validate_text_content_item(content_item, MessageRole.ASSISTANT) if error: return error - else: # must be tool_call + elif content_type in [ContentType.TOOL_CALL, ContentType.FUNCTION_CALL, ContentType.OPENAPI_CALL]: error = self._validate_tool_call_content_item(content_item) if error: return error + + # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools + if self.check_for_unsupported_tools: + if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: + name = "openapi_call" if content_type == ContentType.OPENAPI_CALL else content_item["name"].lower() + if name in self.UNSUPPORTED_TOOLS: + return EvaluationException( + message=f"{name} tool call is currently not supported for {self.error_target} evaluator.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.NOT_APPLICABLE, + target=self.error_target, + ) return None def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[EvaluationException]: @@ -196,21 +235,30 @@ def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[Evaluation if error: return error + valid_tool_content_types = [ContentType.TOOL_RESULT, ContentType.FUNCTION_CALL_OUTPUT, ContentType.MCP_APPROVAL_RESPONSE, ContentType.OPENAPI_CALL_OUTPUT] + valid_tool_content_types_as_strings = [t.value for t in valid_tool_content_types] for content_item in content: content_type = content_item["type"] - if content_type != ContentType.TOOL_RESULT: + if content_type not in valid_tool_content_types: return EvaluationException( - message=f"Invalid content type '{content_type}' for message with role '{MessageRole.TOOL.value}'. Must be '{ContentType.TOOL_RESULT.value}'.", + message=f"Invalid content type '{content_type}' for message with role '{MessageRole.TOOL.value}'. Must be one of {valid_tool_content_types_as_strings}.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - error = self._validate_dict_field( - content_item, "tool_result", f"content items for role '{MessageRole.TOOL.value}'" - ) - if error: - return error + if content_type in [ContentType.TOOL_RESULT, ContentType.OPENAPI_CALL_OUTPUT]: + error = self._validate_field_exists( + content_item, "tool_result", f"content items for role '{MessageRole.TOOL.value}'" + ) + if error: + return error + elif content_type == ContentType.FUNCTION_CALL_OUTPUT: + error = self._validate_field_exists( + content_item, "function_call_output", f"content items for role '{MessageRole.TOOL.value}'" + ) + if error: + return error return None diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_calls_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_calls_validator.py index 63480bd66c92..b875993c05a3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_calls_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_calls_validator.py @@ -18,8 +18,8 @@ class ToolCallsValidator(ToolDefinitionsValidator): optional_tool_definitions = False - def __init__(self, error_target: ErrorTarget, requires_query: bool = True, optional_tool_definitions: bool = False): - super().__init__(error_target, requires_query, optional_tool_definitions) + def __init__(self, error_target: ErrorTarget, requires_query: bool = True, optional_tool_definitions: bool = False, check_for_unsupported_tools: bool = False): + super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools) def _validate_tool_calls(self, tool_calls) -> Optional[EvaluationException]: """Validate tool calls input.""" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_definitions_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_definitions_validator.py index a8947a18a8e6..ea22d79360c9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_definitions_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_tool_definitions_validator.py @@ -18,8 +18,8 @@ class ToolDefinitionsValidator(ConversationValidator): optional_tool_definitions: bool = True - def __init__(self, error_target: ErrorTarget, requires_query: bool = True, optional_tool_definitions: bool = True): - super().__init__(error_target, requires_query) + def __init__(self, error_target: ErrorTarget, requires_query: bool = True, optional_tool_definitions: bool = True, check_for_unsupported_tools: bool = False): + super().__init__(error_target, requires_query, check_for_unsupported_tools) self.optional_tool_definitions = optional_tool_definitions def _validate_tool_definition(self, tool_definition) -> Optional[EvaluationException]: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py index cd5806fcdafb..f4c242a9f02b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py @@ -25,3 +25,9 @@ class ContentType(str, Enum): OUTPUT_TEXT = "output_text" TOOL_CALL = "tool_call" TOOL_RESULT = "tool_result" + FUNCTION_CALL = "function_call" + FUNCTION_CALL_OUTPUT = "function_call_output" + MCP_APPROVAL_REQUEST = "mcp_approval_request" + MCP_APPROVAL_RESPONSE = "mcp_approval_response" + OPENAPI_CALL = "openapi_call" + OPENAPI_CALL_OUTPUT = "openapi_call_output" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index ffe66ff6be98..70bfeace701e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -114,6 +114,7 @@ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs): self._validator = ConversationValidator( error_target=ErrorTarget.GROUNDEDNESS_EVALUATOR, requires_query=False, + check_for_unsupported_tools=True, ) self._validator_with_query = ConversationValidator( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py index 828530275872..a51f7b2c8b00 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py @@ -79,7 +79,7 @@ class _TaskNavigationEfficiencyEvaluator(EvaluatorBase): {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]}, {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]} ], - ground_truth=["identify_tools_to_call", ""call_tool_A", "call_tool_B", "response_synthesis"] + ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"] ) # Example 2: Using tool names with parameters (exact parameter matching required) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 392a5fe1c86c..4033fae627d5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -100,7 +100,10 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, self.threshold = threshold # Initialize input validator - self._validator = ToolCallsValidator(error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR) + self._validator = ToolCallsValidator( + error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + check_for_unsupported_tools=True, + ) super().__init__( model_config=model_config, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 4521996dd384..d5044be333b9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -79,6 +79,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, + check_for_unsupported_tools=True, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index b792177cb43f..ffdc81c7bd31 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -90,7 +90,8 @@ def __init__( # Initialize input validator self._validator = ToolDefinitionsValidator( - error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False + error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, + check_for_unsupported_tools=True, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index 1e4ae3287508..de951498282e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -86,7 +86,10 @@ def __init__( prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) # Initialize input validator - self._validator = ToolDefinitionsValidator(error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR) + self._validator = ToolDefinitionsValidator( + error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, + check_for_unsupported_tools=True, + ) super().__init__( model_config=model_config,