From 589b283b37a1f64e2728d2371b5da8ff7186f729 Mon Sep 17 00:00:00 2001
From: mohessie
Date: Thu, 12 Feb 2026 16:45:58 +0200
Subject: [PATCH] [Agentic Evaluators]: Accept input string as is

---
 .../_groundedness/_groundedness.py            |  5 ++
 .../_tool_call_accuracy.py                    | 37 +++++++------
 .../_tool_call_success/_tool_call_success.py  | 27 ++++++----
 .../_tool_input_accuracy.py                   | 53 ++++++++++++-------
 .../_tool_output_utilization.py               | 38 ++++++++-----
 .../_tool_selection/_tool_selection.py        | 41 ++++++++------
 6 files changed, 126 insertions(+), 75 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index ffe66ff6be98..157becd45c48 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -322,6 +322,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
             )
+
+        # If response is a string, we can skip context extraction and just return the eval input
+        if response and isinstance(response, str):
+            return super()._convert_kwargs_to_eval_input(query=query, response=response, context=response)
+
         context = self._get_context_from_agent_response(response, tool_definitions)
         filtered_response = self._filter_file_search_results(response)
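
NOTE (illustration, not part of the patch): a minimal usage sketch of the groundedness early return above. The model_config wiring follows the SDK's usual pattern; the endpoint, deployment, and key values are placeholders.

    import os
    from azure.ai.evaluation import GroundednessEvaluator

    # Placeholder model configuration; substitute your own Azure OpenAI details.
    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    }

    groundedness = GroundednessEvaluator(model_config=model_config)

    # A plain-string response now skips agent-message context extraction;
    # the string itself doubles as the grounding context.
    result = groundedness(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
    )
    print(result)
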
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 392a5fe1c86c..27321f8d3dfc 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -175,25 +175,32 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            # If no tool calls were provided and the response is a string, use the response string itself as the tool calls
+            if response and isinstance(response, str):
+                tool_calls = response
+            else:
+                return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
 
-        if not isinstance(tool_calls, list):
+        if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
             tool_calls = [tool_calls]
-        if not isinstance(tool_definitions, list):
+        if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
-        try:
-            needed_tool_definitions = self._extract_needed_tool_definitions(
-                tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
-            )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
-
-        if len(needed_tool_definitions) == 0:
+        if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
+            needed_tool_definitions = tool_definitions
+        else:
+            try:
+                needed_tool_definitions = self._extract_needed_tool_definitions(
+                    tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
+                )
+            except EvaluationException as e:
+                # Check if this is because no tool definitions were provided at all
+                if len(tool_definitions) == 0:
+                    return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+                else:
+                    return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if not needed_tool_definitions:
             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
         return {
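
NOTE (illustration, not part of the patch): a sketch of the string pass-through path above, reusing model_config from the previous sketch. The tool name and free-text formats are invented; after this patch both strings are forwarded to the judge prompt as-is.

    from azure.ai.evaluation import ToolCallAccuracyEvaluator

    tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)

    # With nothing parseable into structured tool calls, the response string
    # is treated as the tool-call record, and string tool_definitions skip
    # the _extract_needed_tool_definitions cross-validation entirely.
    result = tool_call_accuracy(
        query="What's the weather in Seattle?",
        response="TOOL CALL: fetch_weather(location='Seattle') -> '72F and sunny'",
        tool_definitions="fetch_weather(location: str): returns current weather for a city",
    )
    print(result)
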
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
index 4521996dd384..c7523e312d52 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -168,16 +168,25 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
                 target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             )
 
-        eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
-
-        if "tool_definitions" in eval_input:
+        # If response is a string, pass it through directly without reformatting
+        if isinstance(eval_input["response"], str):
+            # Unless tool calls were explicitly provided; in that case keep them as-is
+            if "tool_calls" not in eval_input or not eval_input["tool_calls"]:
+                eval_input["tool_calls"] = eval_input["response"]
+        else:
+            eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
+
+        # If tool definitions are a string, pass them through directly; otherwise reformat them.
+        if "tool_definitions" in eval_input and not isinstance(eval_input["tool_definitions"], str):
             tool_definitions = eval_input["tool_definitions"]
-            filtered_tool_definitions = _filter_to_used_tools(
-                tool_definitions=tool_definitions,
-                msgs_list=eval_input["response"],
-                logger=logger,
-            )
-            eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
+            # Filter tool definitions down to the tools actually used, but only when response is not a string
+            if not isinstance(eval_input["response"], str):
+                tool_definitions = _filter_to_used_tools(
+                    tool_definitions=tool_definitions,
+                    msgs_list=eval_input["response"],
+                    logger=logger,
+                )
+            eval_input["tool_definitions"] = _reformat_tool_definitions(tool_definitions, logger)
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", "")
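
NOTE (illustration, not part of the patch): the equivalent call for tool call success. The public class name is assumed from the module layout; verify it against your installed SDK version.

    from azure.ai.evaluation import ToolCallSuccessEvaluator  # assumed export

    tool_call_success = ToolCallSuccessEvaluator(model_config=model_config)

    # A string response is copied into tool_calls untouched (unless tool_calls
    # was passed explicitly), and string tool_definitions bypass both
    # _filter_to_used_tools and _reformat_tool_definitions.
    result = tool_call_success(
        query="Book a table for two at 7pm.",
        response="called book_table(party_size=2, time='19:00') -> confirmation #8812",
        tool_definitions="book_table(party_size: int, time: str): reserves a table",
    )
    print(result)
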
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index b792177cb43f..b39cd9da1050 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -115,37 +115,50 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
         query = kwargs.get("query")
         response = kwargs.get("response")
 
-        # Extract tool calls from response
         if not response:
             return {"error_message": "Response parameter is required to extract tool calls."}
 
+        # Try to parse tool calls from response
        tool_calls = self._parse_tools_from_response(response)
+
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            # If no tool calls were found and the response is a string, use the response string itself as the tool calls
+            if isinstance(response, str):
+                tool_calls = response
+            else:
+                return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
 
-        if not isinstance(tool_calls, list):
+        # Normalize tool_calls and tool_definitions (skip for strings)
+        if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
             tool_calls = [tool_calls]
-        if not isinstance(tool_definitions, list):
+        if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
-        try:
-            # Type cast to satisfy static type checker
-            tool_calls_typed = cast(List[Dict], tool_calls)
-            needed_tool_definitions = self._extract_needed_tool_definitions(
-                tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
-            )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
-
-        if len(needed_tool_definitions) == 0:
+        # Cross-validation (skip when either is a string)
+        if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
+            needed_tool_definitions = tool_definitions
+        else:
+            try:
+                # Type cast to satisfy static type checker
+                tool_calls_typed = cast(List[Dict], tool_calls)
+                needed_tool_definitions = self._extract_needed_tool_definitions(
+                    tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
+                )
+            except EvaluationException:
+                # Check if this is because no tool definitions were provided at all
+                if len(tool_definitions) == 0:
+                    return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+                else:
+                    return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if not needed_tool_definitions:
             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
-        # Reformat agent response with tool calls and results using reformat_agent_response
-        agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
+        # Reformat response for LLM (skip when tool_calls is already a string)
+        if isinstance(tool_calls, str):
+            agent_response_with_tools = tool_calls
+        else:
+            agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
 
         return {
             "query": query,
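
NOTE (illustration, not part of the patch): the same calling convention for tool input accuracy; the class name is again assumed from the module layout.

    from azure.ai.evaluation import ToolInputAccuracyEvaluator  # assumed export

    tool_input_accuracy = ToolInputAccuracyEvaluator(model_config=model_config)

    # The response string becomes both the tool-call record and the
    # agent_response_with_tools payload handed to the judge prompt.
    result = tool_input_accuracy(
        query="Convert 100 USD to EUR.",
        response="convert_currency(amount=100, source='USD', target='EUR') -> 92.40",
        tool_definitions="convert_currency(amount: float, source: str, target: str)",
    )
    print(result)
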
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
index 1e4ae3287508..db009cd92cea 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -179,21 +179,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
             )
 
+        # If query, response, or tool_definitions come in as strings, pass them through without reformatting.
+        # Each parameter is handled individually: strings pass through as-is, structured inputs get filtered and reformatted.
         tool_definitions = eval_input["tool_definitions"]
-        filtered_tool_definitions = filter_to_used_tools(
-            tool_definitions=tool_definitions,
-            msgs_lists=[eval_input["query"], eval_input["response"]],
-            logger=logger,
-        )
-        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
-
-        eval_input["query"] = reformat_conversation_history(
-            eval_input["query"],
-            logger,
-            include_system_messages=True,
-            include_tool_messages=True,
-        )
-        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        if not isinstance(tool_definitions, str):
+            if not isinstance(eval_input.get("query"), str) and not isinstance(eval_input.get("response"), str):
+                filtered_tool_definitions = filter_to_used_tools(
+                    tool_definitions=tool_definitions,
+                    msgs_lists=[eval_input["query"], eval_input["response"]],
+                    logger=logger,
+                )
+            else:
+                filtered_tool_definitions = tool_definitions
+            eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        if not isinstance(eval_input.get("query"), str):
+            eval_input["query"] = reformat_conversation_history(
+                eval_input["query"],
+                logger,
+                include_system_messages=True,
+                include_tool_messages=True,
+            )
+        if not isinstance(eval_input.get("response"), str):
+            eval_input["response"] = reformat_agent_response(
+                eval_input["response"], logger, include_tool_messages=True
+            )
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", "")
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
index 183f659ba860..e7698d688f5d 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -143,29 +143,36 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            # If no tool calls were provided and the response is a string, use the response string itself as the tool calls
+            if response and isinstance(response, str):
+                tool_calls = response
+            else:
+                return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
 
-        if not isinstance(tool_calls, list):
+        if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
             tool_calls = [tool_calls]
-        if not isinstance(tool_definitions, list):
+        if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
-        try:
-            needed_tool_definitions = self._extract_needed_tool_definitions(
-                tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
-            )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
-
-        if len(needed_tool_definitions) == 0:
+        if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
+            needed_tool_definitions = tool_definitions
+        else:
+            try:
+                needed_tool_definitions = self._extract_needed_tool_definitions(
+                    tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
+                )
+            except EvaluationException:
+                # Check if this is because no tool definitions were provided at all
+                if len(tool_definitions) == 0:
+                    return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+                else:
+                    return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if not needed_tool_definitions:
             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
-        # Extract only tool names from tool calls, removing parameters and results
-        tool_names = self._extract_tool_names_from_calls(tool_calls)
+        # Extract only tool names from tool calls, removing parameters and results (skip for strings)
+        tool_names = tool_calls if isinstance(tool_calls, str) else self._extract_tool_names_from_calls(tool_calls)
 
         return {
             "query": query,
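
NOTE (illustration, not part of the patch): the last two hunks enable the same convention for tool output utilization and tool selection; class names assumed as before.

    from azure.ai.evaluation import (  # assumed exports
        ToolOutputUtilizationEvaluator,
        ToolSelectionEvaluator,
    )

    tool_output_utilization = ToolOutputUtilizationEvaluator(model_config=model_config)
    tool_selection = ToolSelectionEvaluator(model_config=model_config)

    # String query/response/tool_definitions now bypass filter_to_used_tools,
    # reformat_conversation_history, and reformat_agent_response, reaching the
    # judge prompt verbatim.
    kwargs = {
        "query": "Find the cheapest flight from JFK to LHR next Friday.",
        "response": "searched via search_flights(origin='JFK', dest='LHR', date='Friday') and answered with the cheapest fare",
        "tool_definitions": "search_flights(origin: str, dest: str, date: str): finds flights",
    }
    print(tool_output_utilization(**kwargs))
    print(tool_selection(**kwargs))
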