diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index ffe66ff6be98..5d83e1e4c0e8 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -5,7 +5,11 @@
 from typing import Dict, List, Optional, Union, Any, Tuple
 from typing_extensions import overload, override
 
-from azure.ai.evaluation._legacy.prompty import AsyncPrompty
+
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._evaluators._common._validators import ConversationValidator, ValidatorInterface
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
index fa75f3f3b892..7d05e7877d79 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
@@ -169,8 +169,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict["llm_output"]
-        # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
         score = math.nan
         if isinstance(llm_output, dict):
             score = llm_output.get("score", math.nan)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
index 27bb6913d0c6..4452ab141186 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
@@ -197,7 +197,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         if not isinstance(eval_input["response"], str):
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = result.get("llm_output")
+        llm_output = result.get("llm_output", result)
         score = math.nan
 
         if isinstance(llm_output, dict):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
index daf4534e3058..404224e3d78c 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
@@ -162,7 +162,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             )
 
         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = result.get("llm_output") if isinstance(result, dict) else result
+        llm_output = result.get("llm_output", result) if isinstance(result, dict) else result
         score = math.nan
 
         llm_output_is_dict = isinstance(llm_output, dict)
@@ -176,19 +176,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 
             binary_result = self._get_binary_result(score)
 
+            input_token_count = result.get("input_token_count", 0) if isinstance(result, dict) else 0
+            output_token_count = result.get("output_token_count", 0) if isinstance(result, dict) else 0
+            total_token_count = result.get("total_token_count", 0) if isinstance(result, dict) else 0
+            finish_reason = result.get("finish_reason", "") if isinstance(result, dict) else ""
+            model_id = result.get("model_id", "") if isinstance(result, dict) else ""
+            sample_input = result.get("sample_input", "") if isinstance(result, dict) else ""
+            sample_output = result.get("sample_output", "") if isinstance(result, dict) else ""
+
             # updating the result key and threshold to int based on the schema
             return {
                 f"{self._result_key}": int(score),
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": int(self._threshold),
                 f"{self._result_key}_reason": reason,
-                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
-                f"{self._result_key}_model": result.get("model_id", ""),
-                f"{self._result_key}_sample_input": result.get("sample_input", ""),
-                f"{self._result_key}_sample_output": result.get("sample_output", ""),
+                f"{self._result_key}_prompt_tokens": input_token_count,
+                f"{self._result_key}_completion_tokens": output_token_count,
+                f"{self._result_key}_total_tokens": total_token_count,
+                f"{self._result_key}_finish_reason": finish_reason,
+                f"{self._result_key}_model": model_id,
+                f"{self._result_key}_sample_input": sample_input,
+                f"{self._result_key}_sample_output": sample_output,
             }
 
         if logger:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
index f17bab27ab5d..7c608e9e148c 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -8,6 +8,7 @@
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 class SimilarityEvaluator(PromptyEvaluatorBase):
@@ -134,3 +135,37 @@ def __call__( # pylint: disable=docstring-missing-param
         :rtype: Dict[str, float]
         """
         return super().__call__(*args, **kwargs)
+
+    @override
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        """Convert keyword arguments to evaluation input, with validation."""
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        ground_truth = kwargs.get("ground_truth")
+
+        # Validate required fields are not None
+        if query is None:
+            raise EvaluationException(
+                message="SimilarityEvaluator: 'query' is a required input and cannot be None.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.SIMILARITY_EVALUATOR,
+            )
+
+        if response is None:
+            raise EvaluationException(
+                message="SimilarityEvaluator: 'response' is a required input and cannot be None.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.SIMILARITY_EVALUATOR,
+            )
+
+        if ground_truth is None:
+            raise EvaluationException(
+                message="SimilarityEvaluator: 'ground_truth' is a required input and cannot be None.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.SIMILARITY_EVALUATOR,
+            )
+
+        return super()._convert_kwargs_to_eval_input(**kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
index eb1f260f1e82..deb89478ee51 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -218,7 +218,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
         }
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
-        llm_output = prompty_output_dict["llm_output"]
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
         if isinstance(llm_output, dict):
             flagged = llm_output.get("flagged", False)
@@ -230,6 +230,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
             return {
                 f"{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": reasoning,
                 f"{self._result_key}_details": llm_output.get("details", ""),
                 f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
index d24688f45804..5291b658ee80 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
@@ -167,11 +167,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             )
         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
-        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+        if "tool_definitions" in eval_input and eval_input["tool_definitions"]:
             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict.get("llm_output", {})
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
         if isinstance(llm_output, dict):
             success_value = llm_output.get("success", False)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
index 828530275872..a51f7b2c8b00 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
@@ -79,7 +79,7 @@ class _TaskNavigationEfficiencyEvaluator(EvaluatorBase):
                 {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
                 {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
             ],
-            ground_truth=["identify_tools_to_call", ""call_tool_A", "call_tool_B", "response_synthesis"]
+            ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
         )
 
         # Example 2: Using tool names with parameters (exact parameter matching required)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 392a5fe1c86c..186a863a229a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -223,7 +223,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         # Single LLM call for all tool calls
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict.get("llm_output", {})
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
         if isinstance(llm_output, dict):
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
index 4521996dd384..b4ef6edaa484 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -180,7 +180,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
             eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict.get("llm_output", "")
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
         if isinstance(llm_output, dict):
             success = llm_output.get("success", False)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index b792177cb43f..430163e018ed 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -178,7 +178,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
 
         # Call the LLM to evaluate
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict.get("llm_output", {})
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
         if isinstance(llm_output, dict):
             result = llm_output.get("result", None)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
index 1e4ae3287508..a1875f52d095 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -196,7 +196,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
 
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict.get("llm_output", "")
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
         if isinstance(llm_output, dict):
             output_label = llm_output.get("label", None)
             if output_label is None:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
index 183f659ba860..7e1c8ce74a17 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -90,7 +90,7 @@ def __init__(self, model_config, *, threshold=1, credential=None, **kwargs):
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
-            threshold=1,
+            threshold=threshold,
             credential=credential,
             **kwargs,
         )
@@ -198,7 +198,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
 
         # Call the LLM to evaluate
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = prompty_output_dict.get("llm_output", {})
+        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
         if isinstance(llm_output, dict):
             score = llm_output.get("score", None)
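
Note (not part of the patch): the change repeated across these evaluators, llm_output = prompty_output_dict.get("llm_output", prompty_output_dict), lets the score-extraction code work whether the flow returns a wrapper dict with an "llm_output" key or the parsed JSON dict directly. The sketch below only illustrates that fallback; extract_score and the fake_* dicts are hypothetical names for this example, not SDK APIs.

# Illustrative sketch only (hypothetical names, not part of the SDK): shows why
# .get("llm_output", <whole dict>) tolerates both output shapes touched in this diff.
import math
from typing import Any, Dict


def extract_score(prompty_output_dict: Dict[str, Any]) -> float:
    # Prefer the "llm_output" wrapper key; otherwise treat the whole dict as the LLM output.
    llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
    if isinstance(llm_output, dict):
        return llm_output.get("score", math.nan)
    return math.nan


# Wrapper shape (wrapped prompty result) and bare shape (parsed JSON returned directly).
fake_wrapped = {"llm_output": {"score": 4, "explanation": "ok"}, "input_token_count": 120}
fake_bare = {"score": 4, "explanation": "ok"}

assert extract_score(fake_wrapped) == 4
assert extract_score(fake_bare) == 4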