Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
)

# If response is a string, we can skip the context extraction and just return the eval input
if response and isinstance(response, str):
return super()._convert_kwargs_to_eval_input(query=query, response=response, context=response)

context = self._get_context_from_agent_response(response, tool_definitions)

filtered_response = self._filter_file_search_results(response)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,25 +175,32 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
tool_calls = parsed_tool_calls

if not tool_calls:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
# If no tool calls provided and response is string, use response string as tool calls as is
if response and isinstance(response, str):
tool_calls = response
else:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}

if not isinstance(tool_calls, list):
if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
tool_calls = [tool_calls]
if not isinstance(tool_definitions, list):
if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
tool_definitions = [tool_definitions] if tool_definitions else []

try:
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

if len(needed_tool_definitions) == 0:
if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
needed_tool_definitions = tool_definitions
else:
try:
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

if not needed_tool_definitions:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

return {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,16 +168,25 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
)

eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)

if "tool_definitions" in eval_input:
# If response is a string, pass directly without reformatting
if isinstance(eval_input["response"], str):
# Unless tool calls are explicitly provided, then keep it as is
if "tool_calls" not in eval_input or not eval_input["tool_calls"]:
eval_input["tool_calls"] = eval_input["response"]
else:
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)

# If tool definitions are string, pass directly without reformatting, else format it.
if "tool_definitions" in eval_input and not isinstance(eval_input["tool_definitions"], str):
tool_definitions = eval_input["tool_definitions"]
filtered_tool_definitions = _filter_to_used_tools(
tool_definitions=tool_definitions,
msgs_list=eval_input["response"],
logger=logger,
)
eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
# Only if response is not a string, we filter tool definitions to only tools needed.
if not isinstance(eval_input["response"], str):
tool_definitions = _filter_to_used_tools(
tool_definitions=tool_definitions,
msgs_list=eval_input["response"],
logger=logger,
)
eval_input["tool_definitions"] = _reformat_tool_definitions(tool_definitions, logger)

prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = prompty_output_dict.get("llm_output", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,37 +115,50 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
query = kwargs.get("query")
response = kwargs.get("response")

# Extract tool calls from response
if not response:
return {"error_message": "Response parameter is required to extract tool calls."}

# Try to parse tool calls from response
tool_calls = self._parse_tools_from_response(response)

if not tool_calls:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
# If no tool calls found and response is string, use response string as tool calls as is
if isinstance(response, str):
tool_calls = response
else:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}

if not isinstance(tool_calls, list):
# Normalize tool_calls and tool_definitions (skip for strings)
if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
tool_calls = [tool_calls]
if not isinstance(tool_definitions, list):
if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
tool_definitions = [tool_definitions] if tool_definitions else []

try:
# Type cast to satisfy static type checker
tool_calls_typed = cast(List[Dict], tool_calls)
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

if len(needed_tool_definitions) == 0:
# Cross-validation (skip when either is string)
if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
needed_tool_definitions = tool_definitions
else:
try:
# Type cast to satisfy static type checker
tool_calls_typed = cast(List[Dict], tool_calls)
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
)
except EvaluationException:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

if not needed_tool_definitions:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

# Reformat agent response with tool calls and results using reformat_agent_response
agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
# Reformat response for LLM (skip for strings - already a string)
if isinstance(tool_calls, str):
agent_response_with_tools = tool_calls
else:
agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)

return {
"query": query,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,21 +179,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
)

# If response or tool_definitions are strings, pass directly without reformatting
# Process each parameter individually - strings pass through, dicts get reformatted
tool_definitions = eval_input["tool_definitions"]
filtered_tool_definitions = filter_to_used_tools(
tool_definitions=tool_definitions,
msgs_lists=[eval_input["query"], eval_input["response"]],
logger=logger,
)
eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)

eval_input["query"] = reformat_conversation_history(
eval_input["query"],
logger,
include_system_messages=True,
include_tool_messages=True,
)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
if not isinstance(tool_definitions, str):
if not isinstance(eval_input.get("query"), str) and not isinstance(eval_input.get("response"), str):
filtered_tool_definitions = filter_to_used_tools(
tool_definitions=tool_definitions,
msgs_lists=[eval_input["query"], eval_input["response"]],
logger=logger,
)
else:
filtered_tool_definitions = tool_definitions
eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)

if not isinstance(eval_input.get("query"), str):
eval_input["query"] = reformat_conversation_history(
eval_input["query"],
logger,
include_system_messages=True,
include_tool_messages=True,
)
if not isinstance(eval_input.get("response"), str):
eval_input["response"] = reformat_agent_response(
eval_input["response"], logger, include_tool_messages=True
)

prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = prompty_output_dict.get("llm_output", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,29 +143,36 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
tool_calls = parsed_tool_calls

if not tool_calls:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
# If no tool calls provided and response is string, use response string as tool calls as is
if response and isinstance(response, str):
tool_calls = response
else:
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}

if not isinstance(tool_calls, list):
if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
tool_calls = [tool_calls]
if not isinstance(tool_definitions, list):
if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
tool_definitions = [tool_definitions] if tool_definitions else []

try:
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
)
except EvaluationException as e:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

if len(needed_tool_definitions) == 0:
if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
needed_tool_definitions = tool_definitions
else:
try:
needed_tool_definitions = self._extract_needed_tool_definitions(
tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
)
except EvaluationException:
# Check if this is because no tool definitions were provided at all
if len(tool_definitions) == 0:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
else:
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

if not needed_tool_definitions:
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

# Extract only tool names from tool calls, removing parameters and results
tool_names = self._extract_tool_names_from_calls(tool_calls)
# Extract only tool names from tool calls, removing parameters and results (skip for strings)
tool_names = tool_calls if isinstance(tool_calls, str) else self._extract_tool_names_from_calls(tool_calls)

return {
"query": query,
Expand Down
Loading