diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
index 3a985afbd42e..dbe6d017aa0b 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -183,12 +183,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             }
 
         binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"gpt_{self._result_key}": float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
 
     @staticmethod
     def _get_built_in_tool_definition(tool_name: str):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
index fa75f3f3b892..8d2c7969a24f 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
@@ -204,14 +204,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
             return response_dict
-        # If llm_output is not a dictionary, return NaN for the score. This should never happen
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
-        binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"gpt_{self._result_key}": float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+        # If llm_output is not a dictionary, raise exception
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
index 27bb6913d0c6..a9c3870e7a4e 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
@@ -220,12 +220,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }
 
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
-        binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
index daf4534e3058..719011a1c881 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
@@ -191,12 +191,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }
 
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
-        binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
index eb1f260f1e82..c3ae0aa0c303 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -241,7 +241,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
 
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
-
-        return {self._result_key: 0}
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
index d24688f45804..9af11422b737 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py
@@ -194,6 +194,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
-        return {self._result_key: 0}
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 392a5fe1c86c..e172dcef67bd 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -262,10 +262,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 
         else:
             raise EvaluationException(
-                message="Tool call accuracy evaluator returned invalid output.",
+                message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
                 category=ErrorCategory.FAILED_EXECUTION,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                target=ErrorTarget.EVALUATE,
             )
 
     async def _real_call(self, **kwargs):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
index 4521996dd384..7feefb990b3e 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -207,16 +207,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
-        score = math.nan
-        binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
 
 
 def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index b792177cb43f..3c2888229206 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -217,10 +217,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
 
         else:
             raise EvaluationException(
-                message="Tool input accuracy evaluator returned invalid output.",
+                message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
                 category=ErrorCategory.FAILED_EXECUTION,
-                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+                target=ErrorTarget.EVALUATE,
             )
 
     async def _real_call(self, **kwargs):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
index 1e4ae3287508..7f3bff7dc4e9 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -232,13 +232,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
-        if logger:
-            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
-        score = math.nan
-        binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+        raise EvaluationException(
+            message="Evaluator returned invalid output.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.EVALUATE,
+        )
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
index 183f659ba860..5cd6ca37a475 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -239,10 +239,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
 
         else:
             raise EvaluationException(
-                message="Tool selection evaluator returned invalid output.",
+                message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
                 category=ErrorCategory.FAILED_EXECUTION,
-                target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
+                target=ErrorTarget.EVALUATE,
             )
 
     async def _real_call(self, **kwargs):
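Taken together, these hunks apply one consistent change: when the prompty flow returns something other than a dictionary, the evaluators no longer log a warning and return placeholder NaN/0 scores; they raise an EvaluationException with SYSTEM_ERROR blame, FAILED_EXECUTION category, and the generic ErrorTarget.EVALUATE target. The sketch below is illustrative only: the FakeEvaluator class and its _flow stub are hypothetical, and the import path azure.ai.evaluation._exceptions is assumed to match what the evaluator modules in this package already use.

# Minimal sketch of the new fallback pattern, under the assumptions noted above.
import asyncio
from typing import Dict, Union

from azure.ai.evaluation._exceptions import (
    ErrorBlame,
    ErrorCategory,
    ErrorTarget,
    EvaluationException,
)


class FakeEvaluator:
    """Hypothetical stand-in that mimics the error handling introduced by this diff."""

    _result_key = "relevance"
    _threshold = 3

    async def _flow(self, **eval_input) -> object:
        # Simulate a malformed LLM response (not a dict).
        return "not a dictionary"

    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
        llm_output = await self._flow(**eval_input)
        if isinstance(llm_output, dict):
            # Happy path (simplified): build the usual score/threshold dictionary.
            score = float(llm_output.get("score", 0))
            return {
                self._result_key: score,
                f"{self._result_key}_threshold": self._threshold,
            }
        # New behavior: fail loudly instead of returning NaN/0 sentinel scores.
        raise EvaluationException(
            message="Evaluator returned invalid output.",
            blame=ErrorBlame.SYSTEM_ERROR,
            category=ErrorCategory.FAILED_EXECUTION,
            target=ErrorTarget.EVALUATE,
        )


async def main() -> None:
    try:
        await FakeEvaluator()._do_eval({"query": "hi", "response": "hello"})
    except EvaluationException as exc:
        # Callers that previously checked for NaN/0 scores now handle the exception.
        print(f"Evaluation failed: {exc}")


if __name__ == "__main__":
    asyncio.run(main())

Callers that relied on filtering out NaN or 0 sentinel scores now need to catch EvaluationException (or let the batch runner record the row as failed) to distinguish malformed model output from a genuinely low score.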