From 589b283b37a1f64e2728d2371b5da8ff7186f729 Mon Sep 17 00:00:00 2001
From: mohessie
Date: Thu, 12 Feb 2026 16:45:58 +0200
Subject: [PATCH] [Agentic Evaluators]: Accept input string as is

---
 .../_groundedness/_groundedness.py            |  5 ++
 .../_tool_call_accuracy.py                    | 37 +++++++------
 .../_tool_call_success/_tool_call_success.py  | 27 ++++++----
 .../_tool_input_accuracy.py                   | 53 ++++++++++++-------
 .../_tool_output_utilization.py               | 38 ++++++++-----
 .../_tool_selection/_tool_selection.py        | 41 ++++++++------
 6 files changed, 126 insertions(+), 75 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index ffe66ff6be98..157becd45c48 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -322,6 +322,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
             )
+
+        # If response is a string, we can skip context extraction and just return the eval input
+        if response and isinstance(response, str):
+            return super()._convert_kwargs_to_eval_input(query=query, response=response, context=response)
+
         context = self._get_context_from_agent_response(response, tool_definitions)
         filtered_response = self._filter_file_search_results(response)
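
NOTE (illustration, not part of the patch): a minimal usage sketch of the groundedness early return above. The model_config wiring follows the SDK's usual pattern; the endpoint, deployment, and key values are placeholders.

    import os
    from azure.ai.evaluation import GroundednessEvaluator

    # Placeholder model configuration; substitute your own Azure OpenAI details.
    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    }

    groundedness = GroundednessEvaluator(model_config=model_config)

    # A plain-string response now skips agent-message context extraction;
    # the string itself doubles as the grounding context.
    result = groundedness(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
    )
    print(result)
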
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 392a5fe1c86c..27321f8d3dfc 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -175,25 +175,32 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            # If no tool calls were provided and the response is a string, use the response string itself as the tool calls
+            if response and isinstance(response, str):
+                tool_calls = response
+            else:
+                return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
 
-        if not isinstance(tool_calls, list):
+        if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
             tool_calls = [tool_calls]
-        if not isinstance(tool_definitions, list):
+        if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
-        try:
-            needed_tool_definitions = self._extract_needed_tool_definitions(
-                tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
-            )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
-
-        if len(needed_tool_definitions) == 0:
+        if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
+            needed_tool_definitions = tool_definitions
+        else:
+            try:
+                needed_tool_definitions = self._extract_needed_tool_definitions(
+                    tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
+                )
+            except EvaluationException as e:
+                # Check if this is because no tool definitions were provided at all
+                if len(tool_definitions) == 0:
+                    return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+                else:
+                    return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if not needed_tool_definitions:
             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
         return {
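
NOTE (illustration, not part of the patch): a sketch of the string pass-through path above, reusing model_config from the previous sketch. The tool name and free-text formats are invented; after this patch both strings are forwarded to the judge prompt as-is.

    from azure.ai.evaluation import ToolCallAccuracyEvaluator

    tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)

    # With nothing parseable into structured tool calls, the response string
    # is treated as the tool-call record, and string tool_definitions skip
    # the _extract_needed_tool_definitions cross-validation entirely.
    result = tool_call_accuracy(
        query="What's the weather in Seattle?",
        response="TOOL CALL: fetch_weather(location='Seattle') -> '72F and sunny'",
        tool_definitions="fetch_weather(location: str): returns current weather for a city",
    )
    print(result)
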
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
index 4521996dd384..c7523e312d52 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -168,16 +168,25 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
                 target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             )
 
-        eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
-
-        if "tool_definitions" in eval_input:
+        # If response is a string, pass it through directly without reformatting
+        if isinstance(eval_input["response"], str):
+            # Unless tool calls were explicitly provided; in that case keep them as-is
+            if "tool_calls" not in eval_input or not eval_input["tool_calls"]:
+                eval_input["tool_calls"] = eval_input["response"]
+        else:
+            eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
+
+        # If tool definitions are a string, pass them through directly; otherwise reformat them.
+        if "tool_definitions" in eval_input and not isinstance(eval_input["tool_definitions"], str):
             tool_definitions = eval_input["tool_definitions"]
-            filtered_tool_definitions = _filter_to_used_tools(
-                tool_definitions=tool_definitions,
-                msgs_list=eval_input["response"],
-                logger=logger,
-            )
-            eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
+            # Filter tool definitions down to the tools actually used, but only when response is not a string
+            if not isinstance(eval_input["response"], str):
+                tool_definitions = _filter_to_used_tools(
+                    tool_definitions=tool_definitions,
+                    msgs_list=eval_input["response"],
+                    logger=logger,
+                )
+            eval_input["tool_definitions"] = _reformat_tool_definitions(tool_definitions, logger)
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", "")
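
NOTE (illustration, not part of the patch): the equivalent call for tool call success. The public class name is assumed from the module layout; verify it against your installed SDK version.

    from azure.ai.evaluation import ToolCallSuccessEvaluator  # assumed export

    tool_call_success = ToolCallSuccessEvaluator(model_config=model_config)

    # A string response is copied into tool_calls untouched (unless tool_calls
    # was passed explicitly), and string tool_definitions bypass both
    # _filter_to_used_tools and _reformat_tool_definitions.
    result = tool_call_success(
        query="Book a table for two at 7pm.",
        response="called book_table(party_size=2, time='19:00') -> confirmation #8812",
        tool_definitions="book_table(party_size: int, time: str): reserves a table",
    )
    print(result)
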
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index b792177cb43f..b39cd9da1050 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -115,37 +115,50 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
         query = kwargs.get("query")
         response = kwargs.get("response")
 
-        # Extract tool calls from response
         if not response:
             return {"error_message": "Response parameter is required to extract tool calls."}
 
+        # Try to parse tool calls from response
        tool_calls = self._parse_tools_from_response(response)
+
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            # If no tool calls were found and the response is a string, use the response string itself as the tool calls
+            if isinstance(response, str):
+                tool_calls = response
+            else:
+                return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
 
-        if not isinstance(tool_calls, list):
+        # Normalize tool_calls and tool_definitions (skip for strings)
+        if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
             tool_calls = [tool_calls]
-        if not isinstance(tool_definitions, list):
+        if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
-        try:
-            # Type cast to satisfy static type checker
-            tool_calls_typed = cast(List[Dict], tool_calls)
-            needed_tool_definitions = self._extract_needed_tool_definitions(
-                tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
-            )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
-
-        if len(needed_tool_definitions) == 0:
+        # Cross-validation (skip when either is a string)
+        if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
+            needed_tool_definitions = tool_definitions
+        else:
+            try:
+                # Type cast to satisfy static type checker
+                tool_calls_typed = cast(List[Dict], tool_calls)
+                needed_tool_definitions = self._extract_needed_tool_definitions(
+                    tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
+                )
+            except EvaluationException:
+                # Check if this is because no tool definitions were provided at all
+                if len(tool_definitions) == 0:
+                    return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+                else:
+                    return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if not needed_tool_definitions:
             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
-        # Reformat agent response with tool calls and results using reformat_agent_response
-        agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
+        # Reformat response for LLM (skip when tool_calls is already a string)
+        if isinstance(tool_calls, str):
+            agent_response_with_tools = tool_calls
+        else:
+            agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
 
         return {
             "query": query,
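
NOTE (illustration, not part of the patch): the same calling convention for tool input accuracy; the class name is again assumed from the module layout.

    from azure.ai.evaluation import ToolInputAccuracyEvaluator  # assumed export

    tool_input_accuracy = ToolInputAccuracyEvaluator(model_config=model_config)

    # The response string becomes both the tool-call record and the
    # agent_response_with_tools payload handed to the judge prompt.
    result = tool_input_accuracy(
        query="Convert 100 USD to EUR.",
        response="convert_currency(amount=100, source='USD', target='EUR') -> 92.40",
        tool_definitions="convert_currency(amount: float, source: str, target: str)",
    )
    print(result)
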
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
index 1e4ae3287508..db009cd92cea 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -179,21 +179,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
                 target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
             )
 
+        # If query, response, or tool_definitions come in as strings, pass them through without reformatting.
+        # Each parameter is handled individually: strings pass through as-is, structured inputs get filtered and reformatted.
         tool_definitions = eval_input["tool_definitions"]
-        filtered_tool_definitions = filter_to_used_tools(
-            tool_definitions=tool_definitions,
-            msgs_lists=[eval_input["query"], eval_input["response"]],
-            logger=logger,
-        )
-        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
-
-        eval_input["query"] = reformat_conversation_history(
-            eval_input["query"],
-            logger,
-            include_system_messages=True,
-            include_tool_messages=True,
-        )
-        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        if not isinstance(tool_definitions, str):
+            if not isinstance(eval_input.get("query"), str) and not isinstance(eval_input.get("response"), str):
+                filtered_tool_definitions = filter_to_used_tools(
+                    tool_definitions=tool_definitions,
+                    msgs_lists=[eval_input["query"], eval_input["response"]],
+                    logger=logger,
+                )
+            else:
+                filtered_tool_definitions = tool_definitions
+            eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        if not isinstance(eval_input.get("query"), str):
+            eval_input["query"] = reformat_conversation_history(
+                eval_input["query"],
+                logger,
+                include_system_messages=True,
+                include_tool_messages=True,
+            )
+        if not isinstance(eval_input.get("response"), str):
+            eval_input["response"] = reformat_agent_response(
+                eval_input["response"], logger, include_tool_messages=True
+            )
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", "")
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
index 183f659ba860..e7698d688f5d 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -143,29 +143,36 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+            # If no tool calls were provided and the response is a string, use the response string itself as the tool calls
+            if response and isinstance(response, str):
+                tool_calls = response
+            else:
+                return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
 
-        if not isinstance(tool_calls, list):
+        if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
             tool_calls = [tool_calls]
-        if not isinstance(tool_definitions, list):
+        if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
             tool_definitions = [tool_definitions] if tool_definitions else []
 
-        try:
-            needed_tool_definitions = self._extract_needed_tool_definitions(
-                tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
-            )
-        except EvaluationException as e:
-            # Check if this is because no tool definitions were provided at all
-            if len(tool_definitions) == 0:
-                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
-            else:
-                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
-
-        if len(needed_tool_definitions) == 0:
+        if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
+            needed_tool_definitions = tool_definitions
+        else:
+            try:
+                needed_tool_definitions = self._extract_needed_tool_definitions(
+                    tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
+                )
+            except EvaluationException:
+                # Check if this is because no tool definitions were provided at all
+                if len(tool_definitions) == 0:
+                    return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+                else:
+                    return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if not needed_tool_definitions:
             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
-        # Extract only tool names from tool calls, removing parameters and results
-        tool_names = self._extract_tool_names_from_calls(tool_calls)
+        # Extract only tool names from tool calls, removing parameters and results (skip for strings)
+        tool_names = tool_calls if isinstance(tool_calls, str) else self._extract_tool_names_from_calls(tool_calls)
 
         return {
             "query": query,
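
NOTE (illustration, not part of the patch): the last two hunks enable the same convention for tool output utilization and tool selection; class names assumed as before.

    from azure.ai.evaluation import (  # assumed exports
        ToolOutputUtilizationEvaluator,
        ToolSelectionEvaluator,
    )

    tool_output_utilization = ToolOutputUtilizationEvaluator(model_config=model_config)
    tool_selection = ToolSelectionEvaluator(model_config=model_config)

    # String query/response/tool_definitions now bypass filter_to_used_tools,
    # reformat_conversation_history, and reformat_agent_response, reaching the
    # judge prompt verbatim.
    kwargs = {
        "query": "Find the cheapest flight from JFK to LHR next Friday.",
        "response": "searched via search_flights(origin='JFK', dest='LHR', date='Friday') and answered with the cheapest fare",
        "tool_definitions": "search_flights(origin: str, dest: str, date: str): finds flights",
    }
    print(tool_output_utilization(**kwargs))
    print(tool_selection(**kwargs))
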