From c263b89af3298f517bb6fac9bfb889a900092f9b Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 12 Feb 2026 17:16:34 +0200 Subject: [PATCH 1/3] Handle intermediate responses and mcp tool approvals --- .../_evaluators/_common/_base_prompty_eval.py | 103 +++++++++++++++++- .../_intent_resolution/_intent_resolution.py | 20 ++++ .../_evaluators/_relevance/_relevance.py | 19 ++++ .../_response_completeness.py | 19 ++++ .../_task_adherence/_task_adherence.py | 19 ++++ .../_task_completion/_task_completion.py | 20 ++++ .../_tool_call_success/_tool_call_success.py | 20 +++- 7 files changed, 214 insertions(+), 6 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index 3a985afbd42e..59e2a2c9a763 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -34,6 +34,71 @@ def value(self) -> str: T = TypeVar("T") +def _is_intermediate_response(response): + """Check if response is intermediate (last content item is function_call or mcp_approval_request).""" + if isinstance(response, list) and len(response) > 0: + last_msg = response[-1] + if isinstance(last_msg, dict) and last_msg.get("role") == "assistant": + content = last_msg.get("content", []) + if isinstance(content, list) and len(content) > 0: + last_content = content[-1] + if isinstance(last_content, dict) and last_content.get("type") in ( + "function_call", + "mcp_approval_request", + ): + return True + return False + + +def _drop_mcp_approval_messages(messages): + """Remove MCP approval request/response messages.""" + if not isinstance(messages, list): + return messages + return [ + msg + for msg in messages + if not ( + isinstance(msg, dict) + and isinstance(msg.get("content"), list) + and ( + ( + msg.get("role") == "assistant" + and any(isinstance(c, dict) and c.get("type") == "mcp_approval_request" for c in msg["content"]) + ) + or ( + msg.get("role") == "tool" + and any(isinstance(c, dict) and c.get("type") == "mcp_approval_response" for c in msg["content"]) + ) + ) + ) + ] + + +def _normalize_function_call_types(messages): + """Normalize function_call/function_call_output types to tool_call/tool_result.""" + if not isinstance(messages, list): + return messages + for msg in messages: + if isinstance(msg, dict) and isinstance(msg.get("content"), list): + for item in msg["content"]: + if isinstance(item, dict) and item.get("type") == "function_call": + item["type"] = "tool_call" + if "function_call" in item: + item["tool_call"] = item.pop("function_call") + elif isinstance(item, dict) and item.get("type") == "function_call_output": + item["type"] = "tool_result" + if "function_call_output" in item: + item["tool_result"] = item.pop("function_call_output") + return messages + + +def _preprocess_messages(messages): + """Drop MCP approval messages and normalize function call types.""" + messages = _drop_mcp_approval_messages(messages) + messages = _normalize_function_call_types(messages) + return messages + + class PromptyEvaluatorBase(EvaluatorBase[T]): """Base class for all evaluators that make use of context as an input. 
It's also assumed that such evaluators make use of a prompty file, and return their results as a dictionary, with a single key-value pair @@ -133,6 +198,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.CONVERSATION, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Call the prompty flow to get the evaluation result. prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) @@ -323,7 +402,7 @@ def _extract_needed_tool_definitions( return needed_tool_definitions def _not_applicable_result( - self, error_message: str, threshold: Union[int, float] + self, error_message: str, threshold: Union[int, float], has_details: bool = False ) -> Dict[str, Union[str, float, Dict]]: """Return a result indicating that the evaluation is not applicable. @@ -331,14 +410,28 @@ def _not_applicable_result( :type error_message: str :param threshold: The threshold value for the evaluator. :type threshold: Union[int, float] + :param has_details: Whether to include an empty details field in the result. + :type has_details: bool :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, Dict]] """ # If no tool calls were made or tool call type is not supported, return not applicable result - return { - self._result_key: self._NOT_APPLICABLE_RESULT, + result = { + self._result_key: threshold, f"{self._result_key}_result": "pass", f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": error_message, - f"{self._result_key}_details": {}, + f"{self._result_key}_reason": f"Not applicable: {error_message}", + f"{self._result_key}_prompt_tokens": 0, + f"{self._result_key}_completion_tokens": 0, + f"{self._result_key}_total_tokens": 0, + f"{self._result_key}_finish_reason": "", + f"{self._result_key}_model": "", + f"{self._result_key}_sample_input": "", + f"{self._result_key}_sample_output": "", } + + # Add empty details field if requested + if has_details: + result[f"{self._result_key}_details"] = {} + + return result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py index fa75f3f3b892..56f3207d0386 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py @@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. 
:rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py if "query" not in eval_input and "response" not in eval_input: raise EvaluationException( @@ -164,6 +170,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # reformat query and response to the format expected by the prompty flow eval_input["query"] = reformat_conversation_history(eval_input["query"], logger) eval_input["response"] = reformat_agent_response(eval_input["response"], logger) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index 27bb6913d0c6..f4435e969e2e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -184,6 +184,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + if "query" not in eval_input and "response" not in eval_input: raise EvaluationException( message="Only text conversation inputs are supported.", @@ -192,6 +198,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.CONVERSATION, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) if not isinstance(eval_input["query"], str): eval_input["query"] = reformat_conversation_history(eval_input["query"], logger) if not isinstance(eval_input["response"], str): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py index daf4534e3058..7f0fa765f9c8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py @@ -149,6 +149,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if "ground_truth" not in eval_input or "response" not in eval_input: @@ -161,6 +167,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t target=ErrorTarget.COMPLETENESS_EVALUATOR, ) + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("ground_truth"), list): + eval_input["ground_truth"] = _preprocess_messages(eval_input["ground_truth"]) + result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = result.get("llm_output") if isinstance(result, dict) else result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py index eb1f260f1e82..6d20dce9cff4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py @@ -153,6 +153,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]] :return: The evaluation result. 
:rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if "query" not in eval_input or "response" not in eval_input: @@ -164,6 +170,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]] target=ErrorTarget.TASK_ADHERENCE_EVALUATOR, ) + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Reformat conversation history and extract system message query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True) system_message = "" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py index d24688f45804..038c62d7ca24 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py @@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if "query" not in eval_input and "response" not in eval_input: @@ -165,6 +171,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TASK_COMPLETION_EVALUATOR, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True) eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True) if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 4521996dd384..94046bda5464 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -16,7 +16,6 @@ from azure.ai.evaluation._evaluators._common._validators import ToolDefinitionsValidator, ValidatorInterface from azure.ai.evaluation._common._experimental import experimental - logger = logging.getLogger(__name__) @@ -151,6 +150,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + if "response" not in eval_input: raise EvaluationException( message="response is a required input to the Tool Call Success evaluator.", @@ -168,6 +173,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) if "tool_definitions" in eval_input: From 36d4153c456cfc544b6190a79a5a2a214335313b Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 12 Feb 2026 17:25:54 +0200 Subject: [PATCH 2/3] add handling in missing evals --- .../_tool_call_accuracy.py | 21 +++++++++++++++++++ .../_tool_input_accuracy.py | 21 +++++++++++++++++++ .../_tool_output_utilization.py | 19 +++++++++++++++++ .../_tool_selection/_tool_selection.py | 21 +++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 392a5fe1c86c..1b78edd852c1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -212,6 +212,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + self.threshold, + has_details=True, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Call Accuracy evaluator."), @@ -221,6 +239,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, ) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Single LLM call for all tool calls prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = prompty_output_dict.get("llm_output", {}) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index b792177cb43f..db6f366e3b8c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -162,6 +162,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. 
:rtype: Dict[str, Union[str, float]] """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + 1, + has_details=True, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to " "the Tool Input Accuracy evaluator."), @@ -171,6 +189,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Format conversation history for cleaner evaluation eval_input["query"] = reformat_conversation_history( eval_input["query"], logger, include_system_messages=True, include_tool_messages=True diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index 1e4ae3287508..d29360b9752b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -168,6 +168,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input): @@ -178,6 +184,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. 
Please provide the agent's final response for evaluation.", + self._threshold, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) tool_definitions = eval_input["tool_definitions"] filtered_tool_definitions = filter_to_used_tools( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 183f659ba860..3741f02b917a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -182,6 +182,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ + # Import helper functions from base class module + from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + _is_intermediate_response, + _preprocess_messages, + ) + + # Check for intermediate response + if _is_intermediate_response(eval_input.get("response")): + return self._not_applicable_result( + "Intermediate response. Please provide the agent's final response for evaluation.", + 1, + has_details=True, + ) + + # Preprocess messages if they are lists + if isinstance(eval_input.get("response"), list): + eval_input["response"] = _preprocess_messages(eval_input["response"]) + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Selection evaluator."), @@ -191,6 +209,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: target=ErrorTarget.TOOL_SELECTION_EVALUATOR, ) + if isinstance(eval_input.get("query"), list): + eval_input["query"] = _preprocess_messages(eval_input["query"]) + # Format conversation history for cleaner evaluation eval_input["query"] = reformat_conversation_history( eval_input["query"], logger, include_system_messages=True, include_tool_messages=True From 2985a152d415452769116cf80290e91b366c0773 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 12 Feb 2026 17:27:47 +0200 Subject: [PATCH 3/3] black --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 6 +++--- .../_tool_input_accuracy/_tool_input_accuracy.py | 6 +++--- .../_tool_output_utilization/_tool_output_utilization.py | 6 +++--- .../_evaluators/_tool_selection/_tool_selection.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 1b78edd852c1..58d132e5b80a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -217,7 +217,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t _is_intermediate_response, _preprocess_messages, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): 
return self._not_applicable_result( @@ -225,11 +225,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t self.threshold, has_details=True, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) - + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Call Accuracy evaluator."), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index db6f366e3b8c..33b8eab33611 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -167,7 +167,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: _is_intermediate_response, _preprocess_messages, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( @@ -175,11 +175,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: 1, has_details=True, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) - + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to " "the Tool Input Accuracy evaluator."), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index d29360b9752b..ae33b9d6c396 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -173,7 +173,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t _is_intermediate_response, _preprocess_messages, ) - + # we override the _do_eval method as we want the output to be a dictionary, # which is a different schema than _base_prompty_eval.py if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input): @@ -184,14 +184,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t category=ErrorCategory.MISSING_FIELD, target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( "Intermediate response. 
Please provide the agent's final response for evaluation.", self._threshold, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 3741f02b917a..61f3367cc0cd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -187,7 +187,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: _is_intermediate_response, _preprocess_messages, ) - + # Check for intermediate response if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( @@ -195,11 +195,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: 1, has_details=True, ) - + # Preprocess messages if they are lists if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) - + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Selection evaluator."),
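# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the diff above): exercises the
# new _is_intermediate_response and _preprocess_messages helpers added in
# _base_prompty_eval.py by this patch. The sample message payloads below
# (get_weather, the approval id, the "72F" output) are hypothetical and chosen
# only to match the shapes the helpers inspect.
# ---------------------------------------------------------------------------
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
    _is_intermediate_response,
    _preprocess_messages,
)

# An "intermediate" response: the last content item of the final assistant
# message is a function_call, so evaluators short-circuit to a not-applicable
# result instead of scoring it.
intermediate = [
    {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "get_weather"}}]},
]
assert _is_intermediate_response(intermediate) is True

# A final response that still carries MCP approval traffic plus raw
# function_call / function_call_output items.
final = [
    {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
    {"role": "tool", "content": [{"type": "mcp_approval_response", "id": "req_1"}]},
    {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "get_weather"}}]},
    {"role": "tool", "content": [{"type": "function_call_output", "function_call_output": "72F"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "It is 72F in Seattle."}]},
]
assert _is_intermediate_response(final) is False

cleaned = _preprocess_messages(final)
# The two MCP approval messages are dropped, and function_call /
# function_call_output items are renamed to tool_call / tool_result so the
# existing reformat_* utilities understand them.
assert len(cleaned) == 3
assert cleaned[0]["content"][0]["type"] == "tool_call"
assert cleaned[1]["content"][0]["type"] == "tool_result"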