@@ -34,6 +34,71 @@ def value(self) -> str:
T = TypeVar("T")


def _is_intermediate_response(response):
"""Check if response is intermediate (last content item is function_call or mcp_approval_request)."""
if isinstance(response, list) and len(response) > 0:
last_msg = response[-1]
if isinstance(last_msg, dict) and last_msg.get("role") == "assistant":
content = last_msg.get("content", [])
if isinstance(content, list) and len(content) > 0:
last_content = content[-1]
if isinstance(last_content, dict) and last_content.get("type") in (
"function_call",
"mcp_approval_request",
):
return True
return False
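# Illustrative sketch only (not part of this diff): how _is_intermediate_response
# classifies a response list. The message shape below is an assumption based on the
# agent-message format these evaluators already accept; names/values are hypothetical.
#
# intermediate = [
#     {
#         "role": "assistant",
#         "content": [
#             {"type": "text", "text": "Let me look that up."},
#             {"type": "function_call", "function_call": {"name": "fetch_weather", "arguments": "{}"}},
#         ],
#     }
# ]
# final = [{"role": "assistant", "content": [{"type": "text", "text": "It is 22C and clear."}]}]
#
# assert _is_intermediate_response(intermediate)    # last content item is a function_call
# assert not _is_intermediate_response(final)       # plain text -> treated as a final answer
# assert not _is_intermediate_response("a string")  # non-list responses are never intermediate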


def _drop_mcp_approval_messages(messages):
"""Remove MCP approval request/response messages."""
if not isinstance(messages, list):
return messages
return [
msg
for msg in messages
if not (
isinstance(msg, dict)
and isinstance(msg.get("content"), list)
and (
(
msg.get("role") == "assistant"
and any(isinstance(c, dict) and c.get("type") == "mcp_approval_request" for c in msg["content"])
)
or (
msg.get("role") == "tool"
and any(isinstance(c, dict) and c.get("type") == "mcp_approval_response" for c in msg["content"])
)
)
)
]
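# Illustrative sketch only (not part of this diff): _drop_mcp_approval_messages removes
# the MCP approval round-trip while leaving ordinary messages untouched. The payload
# keys ("id", "approval_request_id") are assumed for the example.
#
# messages = [
#     {"role": "user", "content": [{"type": "text", "text": "Query the database."}]},
#     {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
#     {"role": "tool", "content": [{"type": "mcp_approval_response", "approval_request_id": "req_1"}]},
#     {"role": "assistant", "content": [{"type": "text", "text": "Here are the results."}]},
# ]
# kept = _drop_mcp_approval_messages(messages)
# assert [m["role"] for m in kept] == ["user", "assistant"]  # only the approval pair is dropped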


def _normalize_function_call_types(messages):
"""Normalize function_call/function_call_output types to tool_call/tool_result."""
if not isinstance(messages, list):
return messages
for msg in messages:
if isinstance(msg, dict) and isinstance(msg.get("content"), list):
for item in msg["content"]:
if isinstance(item, dict) and item.get("type") == "function_call":
item["type"] = "tool_call"
if "function_call" in item:
item["tool_call"] = item.pop("function_call")
elif isinstance(item, dict) and item.get("type") == "function_call_output":
item["type"] = "tool_result"
if "function_call_output" in item:
item["tool_result"] = item.pop("function_call_output")
return messages
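# Illustrative sketch only (not part of this diff): the normalization rewrites item types
# and their payload keys in place, so downstream code sees tool_call/tool_result. Payload
# shapes are assumptions for the sake of the example.
#
# messages = [
#     {
#         "role": "assistant",
#         "content": [{"type": "function_call", "function_call": {"name": "search", "arguments": "{}"}}],
#     },
#     {
#         "role": "tool",
#         "content": [{"type": "function_call_output", "function_call_output": "3 results found"}],
#     },
# ]
# _normalize_function_call_types(messages)
# assert messages[0]["content"][0]["type"] == "tool_call"
# assert "tool_call" in messages[0]["content"][0] and "function_call" not in messages[0]["content"][0]
# assert messages[1]["content"][0]["type"] == "tool_result"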


def _preprocess_messages(messages):
"""Drop MCP approval messages and normalize function call types."""
messages = _drop_mcp_approval_messages(messages)
messages = _normalize_function_call_types(messages)
return messages
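# Illustrative sketch only (not part of this diff): _preprocess_messages chains the two
# helpers above, dropping the MCP approval round-trip first and then normalizing the
# remaining function_call items. Message shapes are assumed as in the examples above.
#
# conversation = [
#     {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
#     {"role": "tool", "content": [{"type": "mcp_approval_response", "approval_request_id": "req_1"}]},
#     {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "search", "arguments": "{}"}}]},
# ]
# cleaned = _preprocess_messages(conversation)
# assert len(cleaned) == 1 and cleaned[0]["content"][0]["type"] == "tool_call"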


class PromptyEvaluatorBase(EvaluatorBase[T]):
"""Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
make use of a prompty file, and return their results as a dictionary, with a single key-value pair
@@ -133,6 +198,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.CONVERSATION,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# Call the prompty flow to get the evaluation result.
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

@@ -323,22 +402,36 @@ def _extract_needed_tool_definitions(
return needed_tool_definitions

def _not_applicable_result(
self, error_message: str, threshold: Union[int, float]
self, error_message: str, threshold: Union[int, float], has_details: bool = False
) -> Dict[str, Union[str, float, Dict]]:
"""Return a result indicating that the evaluation is not applicable.

:param error_message: The error message explaining why evaluation is not applicable.
:type error_message: str
:param threshold: The threshold value for the evaluator.
:type threshold: Union[int, float]
:param has_details: Whether to include an empty details field in the result.
:type has_details: bool
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, Dict]]
"""
# If no tool calls were made or tool call type is not supported, return not applicable result
return {
self._result_key: self._NOT_APPLICABLE_RESULT,
result = {
self._result_key: threshold,
f"{self._result_key}_result": "pass",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_reason": error_message,
f"{self._result_key}_details": {},
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_prompt_tokens": 0,
f"{self._result_key}_completion_tokens": 0,
f"{self._result_key}_total_tokens": 0,
f"{self._result_key}_finish_reason": "",
f"{self._result_key}_model": "",
f"{self._result_key}_sample_input": "",
f"{self._result_key}_sample_output": "",
}

# Add empty details field if requested
if has_details:
result[f"{self._result_key}_details"] = {}

return result
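# Illustrative sketch only (not part of this diff): approximate shape of the value a
# subclass now gets back from the updated _not_applicable_result. The "example_metric"
# result key and the threshold of 3 are hypothetical; each evaluator supplies its own.
#
# result = self._not_applicable_result("Intermediate response. ...", threshold=3)
# # result == {
# #     "example_metric": 3,
# #     "example_metric_result": "pass",
# #     "example_metric_threshold": 3,
# #     "example_metric_reason": "Not applicable: Intermediate response. ...",
# #     "example_metric_prompt_tokens": 0,
# #     "example_metric_completion_tokens": 0,
# #     "example_metric_total_tokens": 0,
# #     "example_metric_finish_reason": "",
# #     "example_metric_model": "",
# #     "example_metric_sample_input": "",
# #     "example_metric_sample_output": "",
# # }
# # With has_details=True (as the tool call accuracy hunk below passes), an
# # "example_metric_details": {} entry is added as well.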
@@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py
if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
@@ -164,6 +170,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# reformat query and response to the format expected by the prompty flow
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
@@ -184,6 +184,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
message="Only text conversation inputs are supported.",
@@ -192,6 +198,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.CONVERSATION,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])
if not isinstance(eval_input["query"], str):
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
if not isinstance(eval_input["response"], str):
@@ -149,6 +149,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary,
# which is a different schema than _base_prompty_eval.py
if "ground_truth" not in eval_input or "response" not in eval_input:
@@ -161,6 +167,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
target=ErrorTarget.COMPLETENESS_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("ground_truth"), list):
eval_input["ground_truth"] = _preprocess_messages(eval_input["ground_truth"])

result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = result.get("llm_output") if isinstance(result, dict) else result

@@ -153,6 +153,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary,
# which is a different schema than _base_prompty_eval.py
if "query" not in eval_input or "response" not in eval_input:
@@ -164,6 +170,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# Reformat conversation history and extract system message
query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
system_message = ""
@@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary,
# which is a different schema than _base_prompty_eval.py
if "query" not in eval_input and "response" not in eval_input:
@@ -165,6 +171,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
@@ -212,6 +212,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self.threshold,
has_details=True,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])

if eval_input.get("query") is None:
raise EvaluationException(
message=("Query is a required input to the Tool Call Accuracy evaluator."),
@@ -221,6 +239,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
)

if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# Single LLM call for all tool calls
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = prompty_output_dict.get("llm_output", {})
@@ -16,7 +16,6 @@
from azure.ai.evaluation._evaluators._common._validators import ToolDefinitionsValidator, ValidatorInterface
from azure.ai.evaluation._common._experimental import experimental


logger = logging.getLogger(__name__)


@@ -151,6 +150,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

if "response" not in eval_input:
raise EvaluationException(
message="response is a required input to the Tool Call Success evaluator.",
@@ -168,6 +173,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)

if "tool_definitions" in eval_input: