@@ -34,6 +34,71 @@ def value(self) -> str:
T = TypeVar("T")


def _is_intermediate_response(response):
"""Check if response is intermediate (last content item is function_call or mcp_approval_request)."""
if isinstance(response, list) and len(response) > 0:
last_msg = response[-1]
if isinstance(last_msg, dict) and last_msg.get("role") == "assistant":
content = last_msg.get("content", [])
if isinstance(content, list) and len(content) > 0:
last_content = content[-1]
if isinstance(last_content, dict) and last_content.get("type") in (
"function_call",
"mcp_approval_request",
):
return True
return False
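# Illustrative sketch only (not part of this diff): how _is_intermediate_response
# classifies a response list. The message shape below is an assumption based on the
# agent-message format these evaluators already accept; names/values are hypothetical.
#
# intermediate = [
#     {
#         "role": "assistant",
#         "content": [
#             {"type": "text", "text": "Let me look that up."},
#             {"type": "function_call", "function_call": {"name": "fetch_weather", "arguments": "{}"}},
#         ],
#     }
# ]
# final = [{"role": "assistant", "content": [{"type": "text", "text": "It is 22C and clear."}]}]
#
# assert _is_intermediate_response(intermediate)    # last content item is a function_call
# assert not _is_intermediate_response(final)       # plain text -> treated as a final answer
# assert not _is_intermediate_response("a string")  # non-list responses are never intermediate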


def _drop_mcp_approval_messages(messages):
"""Remove MCP approval request/response messages."""
if not isinstance(messages, list):
return messages
return [
msg
for msg in messages
if not (
isinstance(msg, dict)
and isinstance(msg.get("content"), list)
and (
(
msg.get("role") == "assistant"
and any(isinstance(c, dict) and c.get("type") == "mcp_approval_request" for c in msg["content"])
)
or (
msg.get("role") == "tool"
and any(isinstance(c, dict) and c.get("type") == "mcp_approval_response" for c in msg["content"])
)
)
)
]
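# Illustrative sketch only (not part of this diff): _drop_mcp_approval_messages removes
# the MCP approval round-trip while leaving ordinary messages untouched. The payload
# keys ("id", "approval_request_id") are assumed for the example.
#
# messages = [
#     {"role": "user", "content": [{"type": "text", "text": "Query the database."}]},
#     {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
#     {"role": "tool", "content": [{"type": "mcp_approval_response", "approval_request_id": "req_1"}]},
#     {"role": "assistant", "content": [{"type": "text", "text": "Here are the results."}]},
# ]
# kept = _drop_mcp_approval_messages(messages)
# assert [m["role"] for m in kept] == ["user", "assistant"]  # only the approval pair is dropped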


def _normalize_function_call_types(messages):
"""Normalize function_call/function_call_output types to tool_call/tool_result."""
if not isinstance(messages, list):
return messages
for msg in messages:
if isinstance(msg, dict) and isinstance(msg.get("content"), list):
for item in msg["content"]:
if isinstance(item, dict) and item.get("type") == "function_call":
item["type"] = "tool_call"
if "function_call" in item:
item["tool_call"] = item.pop("function_call")
elif isinstance(item, dict) and item.get("type") == "function_call_output":
item["type"] = "tool_result"
if "function_call_output" in item:
item["tool_result"] = item.pop("function_call_output")
return messages
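# Illustrative sketch only (not part of this diff): the normalization rewrites item types
# and their payload keys in place, so downstream code sees tool_call/tool_result. Payload
# shapes are assumptions for the sake of the example.
#
# messages = [
#     {
#         "role": "assistant",
#         "content": [{"type": "function_call", "function_call": {"name": "search", "arguments": "{}"}}],
#     },
#     {
#         "role": "tool",
#         "content": [{"type": "function_call_output", "function_call_output": "3 results found"}],
#     },
# ]
# _normalize_function_call_types(messages)
# assert messages[0]["content"][0]["type"] == "tool_call"
# assert "tool_call" in messages[0]["content"][0] and "function_call" not in messages[0]["content"][0]
# assert messages[1]["content"][0]["type"] == "tool_result"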


def _preprocess_messages(messages):
"""Drop MCP approval messages and normalize function call types."""
messages = _drop_mcp_approval_messages(messages)
messages = _normalize_function_call_types(messages)
return messages
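# Illustrative sketch only (not part of this diff): _preprocess_messages chains the two
# helpers above, dropping the MCP approval round-trip first and then normalizing the
# remaining function_call items. Message shapes are assumed as in the examples above.
#
# conversation = [
#     {"role": "assistant", "content": [{"type": "mcp_approval_request", "id": "req_1"}]},
#     {"role": "tool", "content": [{"type": "mcp_approval_response", "approval_request_id": "req_1"}]},
#     {"role": "assistant", "content": [{"type": "function_call", "function_call": {"name": "search", "arguments": "{}"}}]},
# ]
# cleaned = _preprocess_messages(conversation)
# assert len(cleaned) == 1 and cleaned[0]["content"][0]["type"] == "tool_call"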


class PromptyEvaluatorBase(EvaluatorBase[T]):
"""Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
make use of a prompty file, and return their results as a dictionary, with a single key-value pair
@@ -133,6 +198,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.CONVERSATION,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# Call the prompty flow to get the evaluation result.
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

@@ -323,22 +402,36 @@ def _extract_needed_tool_definitions(
return needed_tool_definitions

def _not_applicable_result(
self, error_message: str, threshold: Union[int, float]
self, error_message: str, threshold: Union[int, float], has_details: bool = False
) -> Dict[str, Union[str, float, Dict]]:
"""Return a result indicating that the evaluation is not applicable.

:param error_message: The error message explaining why evaluation is not applicable.
:type error_message: str
:param threshold: The threshold value for the evaluator.
:type threshold: Union[int, float]
:param has_details: Whether to include an empty details field in the result.
:type has_details: bool
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, Dict]]
"""
# If no tool calls were made or tool call type is not supported, return not applicable result
return {
self._result_key: self._NOT_APPLICABLE_RESULT,
result = {
self._result_key: threshold,
f"{self._result_key}_result": "pass",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_reason": error_message,
f"{self._result_key}_details": {},
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_prompt_tokens": 0,
f"{self._result_key}_completion_tokens": 0,
f"{self._result_key}_total_tokens": 0,
f"{self._result_key}_finish_reason": "",
f"{self._result_key}_model": "",
f"{self._result_key}_sample_input": "",
f"{self._result_key}_sample_output": "",
}

# Add empty details field if requested
if has_details:
result[f"{self._result_key}_details"] = {}

return result
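# Illustrative sketch only (not part of this diff): approximate shape of the value a
# subclass now gets back from the updated _not_applicable_result. The "example_metric"
# result key and the threshold of 3 are hypothetical; each evaluator supplies its own.
#
# result = self._not_applicable_result("Intermediate response. ...", threshold=3)
# # result == {
# #     "example_metric": 3,
# #     "example_metric_result": "pass",
# #     "example_metric_threshold": 3,
# #     "example_metric_reason": "Not applicable: Intermediate response. ...",
# #     "example_metric_prompt_tokens": 0,
# #     "example_metric_completion_tokens": 0,
# #     "example_metric_total_tokens": 0,
# #     "example_metric_finish_reason": "",
# #     "example_metric_model": "",
# #     "example_metric_sample_input": "",
# #     "example_metric_sample_output": "",
# # }
# # With has_details=True (as the tool call accuracy hunk below passes), an
# # "example_metric_details": {} entry is added as well.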
@@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py
if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
@@ -164,6 +170,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# reformat query and response to the format expected by the prompty flow
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
@@ -184,6 +184,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
message="Only text conversation inputs are supported.",
@@ -192,6 +198,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.CONVERSATION,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])
if not isinstance(eval_input["query"], str):
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
if not isinstance(eval_input["response"], str):
@@ -149,6 +149,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary,
# which is a different schema than _base_prompty_eval.py
if "ground_truth" not in eval_input or "response" not in eval_input:
@@ -161,6 +167,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
target=ErrorTarget.COMPLETENESS_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("ground_truth"), list):
eval_input["ground_truth"] = _preprocess_messages(eval_input["ground_truth"])

result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = result.get("llm_output") if isinstance(result, dict) else result

@@ -153,6 +153,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary,
# which is a different schema than _base_prompty_eval.py
if "query" not in eval_input or "response" not in eval_input:
@@ -164,6 +170,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# Reformat conversation history and extract system message
query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
system_message = ""
@@ -155,6 +155,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# we override the _do_eval method as we want the output to be a dictionary,
# which is a different schema than _base_prompty_eval.py
if "query" not in eval_input and "response" not in eval_input:
@@ -165,6 +171,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
@@ -212,6 +212,24 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self.threshold,
has_details=True,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])

if eval_input.get("query") is None:
raise EvaluationException(
message=("Query is a required input to the Tool Call Accuracy evaluator."),
@@ -221,6 +239,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
)

if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

# Single LLM call for all tool calls
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = prompty_output_dict.get("llm_output", {})
@@ -16,7 +16,6 @@
from azure.ai.evaluation._evaluators._common._validators import ToolDefinitionsValidator, ValidatorInterface
from azure.ai.evaluation._common._experimental import experimental


logger = logging.getLogger(__name__)


@@ -151,6 +150,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
:return: The evaluation result.
:rtype: Dict
"""
# Import helper functions from base class module
from azure.ai.evaluation._evaluators._common._base_prompty_eval import (
_is_intermediate_response,
_preprocess_messages,
)

if "response" not in eval_input:
raise EvaluationException(
message="response is a required input to the Tool Call Success evaluator.",
@@ -168,6 +173,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
)

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)

# Preprocess messages if they are lists
if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
if isinstance(eval_input.get("query"), list):
eval_input["query"] = _preprocess_messages(eval_input["query"])

eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)

if "tool_definitions" in eval_input: