diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
index 3a985afbd42e..59e2a2c9a763 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -34,6 +34,71 @@ def value(self) -> str:
 T = TypeVar("T")
 
 
+def _is_intermediate_response(response):
+    """Check if response is intermediate (last content item is function_call or mcp_approval_request)."""
+    if isinstance(response, list) and len(response) > 0:
+        last_msg = response[-1]
+        if isinstance(last_msg, dict) and last_msg.get("role") == "assistant":
+            content = last_msg.get("content", [])
+            if isinstance(content, list) and len(content) > 0:
+                last_content = content[-1]
+                if isinstance(last_content, dict) and last_content.get("type") in (
+                    "function_call",
+                    "mcp_approval_request",
+                ):
+                    return True
+    return False
+
+
+def _drop_mcp_approval_messages(messages):
+    """Remove MCP approval request/response messages."""
+    if not isinstance(messages, list):
+        return messages
+    return [
+        msg
+        for msg in messages
+        if not (
+            isinstance(msg, dict)
+            and isinstance(msg.get("content"), list)
+            and (
+                (
+                    msg.get("role") == "assistant"
+                    and any(isinstance(c, dict) and c.get("type") == "mcp_approval_request" for c in msg["content"])
+                )
+                or (
+                    msg.get("role") == "tool"
+                    and any(isinstance(c, dict) and c.get("type") == "mcp_approval_response" for c in msg["content"])
+                )
+            )
+        )
+    ]
+
+
+def _normalize_function_call_types(messages):
+    """Normalize function_call/function_call_output types to tool_call/tool_result."""
+    if not isinstance(messages, list):
+        return messages
+    for msg in messages:
+        if isinstance(msg, dict) and isinstance(msg.get("content"), list):
+            for item in msg["content"]:
+                if isinstance(item, dict) and item.get("type") == "function_call":
+                    item["type"] = "tool_call"
+                    if "function_call" in item:
+                        item["tool_call"] = item.pop("function_call")
+                elif isinstance(item, dict) and item.get("type") == "function_call_output":
+                    item["type"] = "tool_result"
+                    if "function_call_output" in item:
+                        item["tool_result"] = item.pop("function_call_output")
+    return messages
+
+
+def _preprocess_messages(messages):
+    """Drop MCP approval messages and normalize function call types."""
+    messages = _drop_mcp_approval_messages(messages)
+    messages = _normalize_function_call_types(messages)
+    return messages
+
+
 class PromptyEvaluatorBase(EvaluatorBase[T]):
     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
@@ -133,6 +198,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
             )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         # Call the prompty flow to get the evaluation result.
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
@@ -323,7 +402,7 @@ def _extract_needed_tool_definitions(
         return needed_tool_definitions
 
     def _not_applicable_result(
-        self, error_message: str, threshold: Union[int, float]
+        self, error_message: str, threshold: Union[int, float], has_details: bool = False
     ) -> Dict[str, Union[str, float, Dict]]:
         """Return a result indicating that the evaluation is not applicable.
 
@@ -331,14 +410,28 @@ def _not_applicable_result(
         :type error_message: str
         :param threshold: The threshold value for the evaluator.
         :type threshold: Union[int, float]
+        :param has_details: Whether to include an empty details field in the result.
+        :type has_details: bool
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float, Dict]]
         """
         # If no tool calls were made or tool call type is not supported, return not applicable result
-        return {
-            self._result_key: self._NOT_APPLICABLE_RESULT,
+        result = {
+            self._result_key: threshold,
             f"{self._result_key}_result": "pass",
             f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_reason": error_message,
-            f"{self._result_key}_details": {},
+            f"{self._result_key}_reason": f"Not applicable: {error_message}",
+            f"{self._result_key}_prompt_tokens": 0,
+            f"{self._result_key}_completion_tokens": 0,
+            f"{self._result_key}_total_tokens": 0,
+            f"{self._result_key}_finish_reason": "",
+            f"{self._result_key}_model": "",
+            f"{self._result_key}_sample_input": "",
+            f"{self._result_key}_sample_output": "",
         }
+
+        # Add empty details field if requested
+        if has_details:
+            result[f"{self._result_key}_details"] = {}
+
+        return result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
index fa75f3f3b892..56f3207d0386 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
@@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         # we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py
         if "query" not in eval_input and "response" not in eval_input:
             raise EvaluationException(
@@ -164,6 +170,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
             )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         # reformat query and response to the format expected by the prompty flow
         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
index 27bb6913d0c6..f4435e969e2e 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
@@ -184,6 +184,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         if "query" not in eval_input and "response" not in eval_input:
             raise EvaluationException(
                 message="Only text conversation inputs are supported.",
@@ -192,6 +198,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
             )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
         if not isinstance(eval_input["query"], str):
             eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
         if not isinstance(eval_input["response"], str):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
index daf4534e3058..7f0fa765f9c8 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
@@ -149,6 +149,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         # we override the _do_eval method as we want the output to be a dictionary,
         # which is a different schema than _base_prompty_eval.py
         if "ground_truth" not in eval_input or "response" not in eval_input:
@@ -161,6 +167,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.COMPLETENESS_EVALUATOR,
             )
 
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("ground_truth"), list):
+            eval_input["ground_truth"] = _preprocess_messages(eval_input["ground_truth"])
+
         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = result.get("llm_output") if isinstance(result, dict) else result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
index eb1f260f1e82..6d20dce9cff4 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -153,6 +153,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         # we override the _do_eval method as we want the output to be a dictionary,
         # which is a different schema than _base_prompty_eval.py
         if "query" not in eval_input or "response" not in eval_input:
@@ -164,6 +170,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
                 target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
             )
 
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         # Reformat conversation history and extract system message
         query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
         system_message = ""
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
index d24688f45804..038c62d7ca24 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
@@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         # we override the _do_eval method as we want the output to be a dictionary,
         # which is a different schema than _base_prompty_eval.py
         if "query" not in eval_input and "response" not in eval_input:
@@ -165,6 +171,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
             )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 392a5fe1c86c..58d132e5b80a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -212,6 +212,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self.threshold,
+                has_details=True,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+
         if eval_input.get("query") is None:
             raise EvaluationException(
                 message=("Query is a required input to the Tool Call Accuracy evaluator."),
@@ -221,6 +239,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
             )
 
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         # Single LLM call for all tool calls
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
index 4521996dd384..94046bda5464 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -16,7 +16,6 @@
 from azure.ai.evaluation._evaluators._common._validators import ToolDefinitionsValidator, ValidatorInterface
 from azure.ai.evaluation._common._experimental import experimental
 
-
 logger = logging.getLogger(__name__)
 
@@ -151,6 +150,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         if "response" not in eval_input:
             raise EvaluationException(
                 message="response is a required input to the Tool Call Success evaluator.",
@@ -168,6 +173,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
                 target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             )
 
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
 
         if "tool_definitions" in eval_input:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index b792177cb43f..33b8eab33611 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -162,6 +162,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                1,
+                has_details=True,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+
         if eval_input.get("query") is None:
             raise EvaluationException(
                 message=("Query is a required input to " "the Tool Input Accuracy evaluator."),
@@ -171,6 +189,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
                 target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             )
 
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         # Format conversation history for cleaner evaluation
         eval_input["query"] = reformat_conversation_history(
             eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
index 1e4ae3287508..ae33b9d6c396 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -168,6 +168,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
         # we override the _do_eval method as we want the output to be a dictionary,
         # which is a different schema than _base_prompty_eval.py
         if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input):
@@ -179,6 +185,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
             )
 
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                self._threshold,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         tool_definitions = eval_input["tool_definitions"]
         filtered_tool_definitions = filter_to_used_tools(
             tool_definitions=tool_definitions,
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
index 183f659ba860..61f3367cc0cd 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -182,6 +182,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
+        # Import helper functions from base class module
+        from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
+            _is_intermediate_response,
+            _preprocess_messages,
+        )
+
+        # Check for intermediate response
+        if _is_intermediate_response(eval_input.get("response")):
+            return self._not_applicable_result(
+                "Intermediate response. Please provide the agent's final response for evaluation.",
+                1,
+                has_details=True,
+            )
+
+        # Preprocess messages if they are lists
+        if isinstance(eval_input.get("response"), list):
+            eval_input["response"] = _preprocess_messages(eval_input["response"])
+
         if eval_input.get("query") is None:
             raise EvaluationException(
                 message=("Query is a required input to the Tool Selection evaluator."),
@@ -191,6 +209,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
                 target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
            )
 
+        if isinstance(eval_input.get("query"), list):
+            eval_input["query"] = _preprocess_messages(eval_input["query"])
+
         # Format conversation history for cleaner evaluation
         eval_input["query"] = reformat_conversation_history(
             eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
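
Illustrative sketch (not part of the patch above): how the new helpers in _base_prompty_eval.py reshape an agent response before it reaches the prompty flow. The message payloads below are invented for illustration; only _is_intermediate_response and _preprocess_messages come from this change.

from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
    _is_intermediate_response,
    _preprocess_messages,
)

# Hypothetical agent response: an MCP approval round-trip, a function call with
# its output, and a final assistant text answer.
response = [
    {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
    {"role": "tool", "content": [{"type": "mcp_approval_response", "approve": True}]},
    {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "get_weather"}}]},
    {"role": "tool", "content": [{"type": "function_call_output", "function_call_output": "72F"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "It is 72F in Seattle."}]},
]

# The last message is a plain text answer, so the conversation is not treated
# as an intermediate response and evaluation proceeds.
assert not _is_intermediate_response(response)

cleaned = _preprocess_messages(response)
# The two MCP approval messages are dropped, and the surviving function_call /
# function_call_output items are renamed to tool_call / tool_result:
# [
#     {"role": "assistant", "content": [{"type": "tool_call", "tool_call": {"name": "get_weather"}}]},
#     {"role": "tool", "content": [{"type": "tool_result", "tool_result": "72F"}]},
#     {"role": "assistant", "content": [{"type": "text", "text": "It is 72F in Seattle."}]},
# ]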