From c263b89af3298f517bb6fac9bfb889a900092f9b Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 12 Feb 2026 17:16:34 +0200 Subject: [PATCH 1/3] Handle intermediate responses and mcp tool approvals --- .../_evaluators/_common/_base_prompty_eval.py | 103 +++++++++++++++++- .../_intent_resolution/_intent_resolution.py | 20 ++++ .../_evaluators/_relevance/_relevance.py | 19 ++++ .../_response_completeness.py | 19 ++++ .../_task_adherence/_task_adherence.py | 19 ++++ .../_task_completion/_task_completion.py | 20 ++++ .../_tool_call_success/_tool_call_success.py | 20 +++- 7 files changed, 214 insertions(+), 6 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index 3a985afbd42e..59e2a2c9a763 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -34,6 +34,71 @@ def value(self) -> str: T = TypeVar("T") +def _is_intermediate_response(response): + """Check if response is intermediate (last content item is function_call or mcp_approval_request).""" + if isinstance(response, list) and len(response) > 0: + last_msg = response[-1] + if isinstance(last_msg, dict) and last_msg.get("role") == "assistant": + content = last_msg.get("content", []) + if isinstance(content, list) and len(content) > 0: + last_content = content[-1] + if isinstance(last_content, dict) and last_content.get("type") in ( + "function_call", + "mcp_approval_request", + ): + return True + return False + + +def _drop_mcp_approval_messages(messages): + """Remove MCP approval request/response messages.""" + if not isinstance(messages, list): + return messages + return [ + msg + for msg in messages + if not ( + isinstance(msg, dict) + and isinstance(msg.get("content"), list) + and ( + ( + msg.get("role") == "assistant" + and any(isinstance(c, dict) and c.get("type") == "mcp_approval_request" for c in msg["content"]) + ) + or ( + msg.get("role") == "tool" + and any(isinstance(c, dict) and c.get("type") == "mcp_approval_response" for c in msg["content"]) + ) + ) + ) + ] + + +def _normalize_function_call_types(messages): + """Normalize function_call/function_call_output types to tool_call/tool_result.""" + if not isinstance(messages, list): + return messages + for msg in messages: + if isinstance(msg, dict) and isinstance(msg.get("content"), list): + for item in msg["content"]: + if isinstance(item, dict) and item.get("type") == "function_call": + item["type"] = "tool_call" + if "function_call" in item: + item["tool_call"] = item.pop("function_call") + elif isinstance(item, dict) and item.get("type") == "function_call_output": + item["type"] = "tool_result" + if "function_call_output" in item: + item["tool_result"] = item.pop("function_call_output") + return messages + + +def _preprocess_messages(messages): + """Drop MCP approval messages and normalize function call types.""" + messages = _drop_mcp_approval_messages(messages) + messages = _normalize_function_call_types(messages) + return messages + + class PromptyEvaluatorBase(EvaluatorBase[T]): """Base class for all evaluators that make use of context as an input. 
It's also assumed that such evaluators make use of a prompty file, and return their results as a dictionary, with a single key-value pair @@ -133,6 +198,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.CONVERSATION, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Call the prompty flow to get the evaluation result. prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) @@ -323,7 +402,7 @@ def _extract_needed_tool_definitions( return needed_tool_definitions def _not_applicable_result( - self, error_message: str, threshold: Union[int, float] + self, error_message: str, threshold: Union[int, float], has_details: bool = False ) -> Dict[str, Union[str, float, Dict]]: """Return a result indicating that the evaluation is not applicable. @@ -331,14 +410,28 @@ def _not_applicable_result( :type error_message: str :param threshold: The threshold value for the evaluator. :type threshold: Union[int, float] + :param has_details: Whether to include an empty details field in the result. + :type has_details: bool :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, Dict]] """ # If no tool calls were made or tool call type is not supported, return not applicable result - return { - self._result_key: self._NOT_APPLICABLE_RESULT, + result = { + self._result_key: threshold, f"{self._result_key}_result": "pass", f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": error_message, - f"{self._result_key}_details": {}, + f"{self._result_key}_reason": f"Not applicable: {error_message}", + f"{self._result_key}_prompt_tokens": 0, + f"{self._result_key}_completion_tokens": 0, + f"{self._result_key}_total_tokens": 0, + f"{self._result_key}_finish_reason": "", + f"{self._result_key}_model": "", + f"{self._result_key}_sample_input": "", + f"{self._result_key}_sample_output": "", } + + # Add empty details field if requested + if has_details: + result[f"{self._result_key}_details"] = {} + + return result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py index fa75f3f3b892..56f3207d0386 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py @@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. 
:rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py if "query" not in eval_input and "response" not in eval_input: raise EvaluationException( @@ -164,6 +170,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # reformat query and response to the format expected by the prompty flow eval_input["query"] = reformat_conversation_history(eval_input["query"], logger) eval_input["response"] = reformat_agent_response(eval_input["response"], logger) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 27bb6913d0c6..f4435e969e2e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -184,6 +184,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + if "query" not in eval_input and "response" not in eval_input: raise EvaluationException( message="Only text conversation inputs are supported.", @@ -192,6 +198,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.CONVERSATION, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) if not isinstance(eval_input["query"], str): eval_input["query"] = reformat_conversation_history(eval_input["query"], logger) if not isinstance(eval_input["response"], str): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py index daf4534e3058..7f0fa765f9c8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py @@ -149,6 +149,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if "ground_truth" not in eval_input or "response" not in eval_input: @@ -161,6 +167,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t target=ErrorTarget.COMPLETENESS_EVALUATOR, ) + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("ground_truth"), list): + eval_input["ground_truth"] = _preprocess_messages(eval_input["ground_truth"]) + result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = result.get("llm_output") if isinstance(result, dict) else result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py index eb1f260f1e82..6d20dce9cff4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py @@ -153,6 +153,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]] :return: The evaluation result. 
:rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if "query" not in eval_input or "response" not in eval_input: @@ -164,6 +170,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]] target=ErrorTarget.TASK_ADHERENCE_EVALUATOR, ) + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Reformat conversation history and extract system message query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True) system_message = "" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py index d24688f45804..038c62d7ca24 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py @@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if "query" not in eval_input and "response" not in eval_input: @@ -165,6 +171,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TASK_COMPLETION_EVALUATOR, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True) eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True) if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 4521996dd384..94046bda5464 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -16,7 +16,6 @@ from azure.ai.evaluation._evaluators._common._validators import ToolDefinitionsValidator, ValidatorInterface from azure.ai.evaluation._common._experimental import experimental - logger = logging.getLogger(__name__) @@ -151,6 +150,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + if "response" not in eval_input: raise EvaluationException( message="response is a required input to the Tool Call Success evaluator.", @@ -168,6 +173,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) if "tool_definitions" in eval_input: From 36d4153c456cfc544b6190a79a5a2a214335313b Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 12 Feb 2026 17:25:54 +0200 Subject: [PATCH 2/3] add handling in missing evals --- .../_tool_call_accuracy.py | 21 +++++++++++++++++++ .../_tool_input_accuracy.py | 21 +++++++++++++++++++ .../_tool_output_utilization.py | 19 +++++++++++++++++ .../_tool_selection/_tool_selection.py | 21 +++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 392a5fe1c86c..1b78edd852c1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -212,6 +212,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self.threshold, + has_details=True, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Call Accuracy evaluator."), @@ -221,6 +239,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Single LLM call for all tool calls prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = prompty_output_dict.get("llm_output", {}) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index b792177cb43f..db6f366e3b8c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -162,6 +162,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. 
:rtype: Dict[str, Union[str, float]] """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + 1, + has_details=True, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to " "the Tool Input Accuracy evaluator."), @@ -171,6 +189,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Format conversation history for cleaner evaluation eval_input["query"] = reformat_conversation_history( eval_input["query"], logger, include_system_messages=True, include_tool_messages=True diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index 1e4ae3287508..d29360b9752b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -168,6 +168,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input): @@ -178,6 +184,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) tool_definitions = eval_input["tool_definitions"] filtered_tool_definitions = filter_to_used_tools( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 183f659ba860..3741f02b917a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -182,6 +182,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + 1, + has_details=True, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Selection evaluator."), @@ -191,6 +209,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: target=ErrorTarget.TOOL_SELECTION_EVALUATOR, ) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Format conversation history for cleaner evaluation eval_input["query"] = reformat_conversation_history( eval_input["query"], logger, include_system_messages=True, include_tool_messages=True From 2985a152d415452769116cf80290e91b366c0773 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 12 Feb 2026 17:27:47 +0200 Subject: [PATCH 3/3] black --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 6 +++--- .../_tool_input_accuracy/_tool_input_accuracy.py | 6 +++--- .../_tool_output_utilization/_tool_output_utilization.py | 6 +++--- .../_evaluators/_tool_selection/_tool_selection.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 1b78edd852c1..58d132e5b80a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -217,7 +217,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t _is_intermediate_response, _preprocess_messages, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): 
return self._not_applicable_result( @@ -225,11 +225,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t self.threshold, has_details=True, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) - + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Call Accuracy evaluator."), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index db6f366e3b8c..33b8eab33611 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -167,7 +167,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: _is_intermediate_response, _preprocess_messages, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( @@ -175,11 +175,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: 1, has_details=True, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) - + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to " "the Tool Input Accuracy evaluator."), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index d29360b9752b..ae33b9d6c396 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -173,7 +173,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t _is_intermediate_response, _preprocess_messages, ) - + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input): @@ -184,14 +184,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( "Intermediate response. 
Please provide the agent's final response for evaluation.", self._threshold, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 3741f02b917a..61f3367cc0cd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -187,7 +187,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: _is_intermediate_response, _preprocess_messages, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( @@ -195,11 +195,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: 1, has_details=True, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) - + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Selection evaluator."),
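# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the diff above): exercises the
# new _is_intermediate_response and _preprocess_messages helpers added in
# _base_prompty_eval.py by this patch. The sample message payloads below
# (get_weather, the approval id, the "72F" output) are hypothetical and chosen
# only to match the shapes the helpers inspect.
# ---------------------------------------------------------------------------
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
    _is_intermediate_response,
    _preprocess_messages,
)

# An "intermediate" response: the last content item of the final assistant
# message is a function_call, so evaluators short-circuit to a not-applicable
# result instead of scoring it.
intermediate = [
    {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "get_weather"}}]},
]
assert _is_intermediate_response(intermediate) is True

# A final response that still carries MCP approval traffic plus raw
# function_call / function_call_output items.
final = [
    {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
    {"role": "tool", "content": [{"type": "mcp_approval_response", "id": "req_1"}]},
    {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "get_weather"}}]},
    {"role": "tool", "content": [{"type": "function_call_output", "function_call_output": "72F"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "It is 72F in Seattle."}]},
]
assert _is_intermediate_response(final) is False

cleaned = _preprocess_messages(final)
# The two MCP approval messages are dropped, and function_call /
# function_call_output items are renamed to tool_call / tool_result so the
# existing reformat_* utilities understand them.
assert len(cleaned) == 3
assert cleaned[0]["content"][0]["type"] == "tool_call"
assert cleaned[1]["content"][0]["type"] == "tool_result"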