diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index b0a76728f..c8ef56939 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1,13 +1,9 @@ from typing import Optional, List from uuid import UUID, uuid4 -from json import loads from oss.src.utils.helpers import get_slug_from_name_and_id from oss.src.services.db_manager import fetch_evaluator_config from oss.src.core.workflows.dtos import ( - WorkflowFlags, - WorkflowQueryFlags, - # WorkflowCreate, WorkflowEdit, WorkflowQuery, @@ -17,8 +13,6 @@ WorkflowVariantEdit, WorkflowVariantQuery, # - WorkflowRevisionData, - # WorkflowRevisionCreate, WorkflowRevisionEdit, WorkflowRevisionCommit, @@ -35,11 +29,7 @@ SimpleEvaluatorEdit, SimpleEvaluatorQuery, SimpleEvaluatorFlags, - SimpleEvaluatorQueryFlags, - # EvaluatorFlags, - EvaluatorQueryFlags, - # Evaluator, EvaluatorQuery, EvaluatorRevisionsLog, @@ -1435,11 +1425,33 @@ def _transfer_evaluator_revision_data( else None ) headers = None + # TODO: This function reconstructs output schemas from old evaluator settings. + # When fully migrating to the new workflow-based evaluator system, the output + # schema should be stored directly in the evaluator revision (workflow revision) + # at configuration time, rather than being inferred from settings here. + # For evaluators with dynamic outputs (auto_ai_critique, json_multi_field_match), + # the frontend/API should build and save the complete output schema when the + # user configures the evaluator. outputs_schema = None if str(old_evaluator.evaluator_key) == "auto_ai_critique": json_schema = old_evaluator.settings_values.get("json_schema", None) if json_schema and isinstance(json_schema, dict): outputs_schema = json_schema.get("schema", None) + # Handle json_multi_field_match with dynamic field-based properties + if str(old_evaluator.evaluator_key) == "json_multi_field_match": + # Build dynamic properties based on configured fields + fields = old_evaluator.settings_values.get("fields", []) + properties = {"aggregate_score": {"type": "number"}} + for field in fields: + # Each field becomes a numeric score (0 or 1) + properties[field] = {"type": "number"} + outputs_schema = { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": properties, + "required": ["aggregate_score"], + "additionalProperties": False, + } if not outputs_schema: properties = ( {"score": {"type": "number"}, "success": {"type": "boolean"}} diff --git a/api/oss/src/models/api/evaluation_model.py b/api/oss/src/models/api/evaluation_model.py index 82e9f35cd..dc006e11d 100644 --- a/api/oss/src/models/api/evaluation_model.py +++ b/api/oss/src/models/api/evaluation_model.py @@ -20,6 +20,7 @@ class LegacyEvaluator(BaseModel): oss: Optional[bool] = False requires_llm_api_keys: Optional[bool] = False tags: List[str] + archived: Optional[bool] = False class EvaluatorConfig(BaseModel): diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py index 13bf9617c..1e0eacdc8 100644 --- a/api/oss/src/resources/evaluators/evaluators.py +++ b/api/oss/src/resources/evaluators/evaluators.py @@ -332,6 +332,7 @@ "name": "JSON Field Match", "key": "field_match_test", "direct_use": False, + "archived": True, # Deprecated - use json_multi_field_match instead "settings_template": { "json_field": { "label": "JSON Field", @@ -355,6 +356,33 @@ "oss": True, "tags": ["classifiers"], }, + { + "name": "JSON Multi-Field Match", + "key": 
"json_multi_field_match", + "direct_use": False, + "settings_template": { + "fields": { + "label": "Fields to Compare", + "type": "fields_tags_editor", # Custom type - tag-based add/remove editor + "required": True, + "description": "Add fields to compare using dot notation for nested paths (e.g., user.name)", + }, + "correct_answer_key": { + "label": "Expected Answer Column", + "default": "correct_answer", + "type": "string", + "required": True, + "description": "Column name containing the expected JSON object", + "ground_truth_key": True, + "advanced": True, # Hidden in advanced section + }, + }, + "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate metric (0 or 1), with an aggregate_score showing the percentage of matching fields. Useful for entity extraction validation.", + "requires_testcase": "always", + "requires_trace": "always", + "oss": True, + "tags": ["classifiers"], + }, { "name": "JSON Diff Match", "key": "auto_json_diff", diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index 2545a303c..bb93982be 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -1,34 +1,30 @@ -import re import json +import re import traceback -from typing import Any, Dict, Union, List, Optional +from typing import Any, Dict, List, Optional, Union -import litellm import httpx +import litellm +from agenta.sdk.managers.secrets import SecretsManager from fastapi import HTTPException from openai import AsyncOpenAI - -# COMMENTED OUT: autoevals dependency removed -# from autoevals.ragas import Faithfulness, ContextRelevancy - -from oss.src.utils.logging import get_module_logger -from oss.src.services.security import sandbox -from oss.src.models.shared_models import Error, Result from oss.src.models.api.evaluation_model import ( EvaluatorInputInterface, - EvaluatorOutputInterface, EvaluatorMappingInputInterface, EvaluatorMappingOutputInterface, + EvaluatorOutputInterface, ) +from oss.src.models.shared_models import Error, Result +from oss.src.services.security import sandbox + +# COMMENTED OUT: autoevals dependency removed +# from autoevals.ragas import Faithfulness, ContextRelevancy +from oss.src.utils.logging import get_module_logger from oss.src.utils.traces import ( - remove_trace_prefix, - process_distributed_trace_into_trace_tree, get_field_value_from_trace_tree, + process_distributed_trace_into_trace_tree, ) -from agenta.sdk.managers.secrets import SecretsManager - - log = get_module_logger(__name__) @@ -253,7 +249,7 @@ async def auto_exact_match( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -352,6 +348,139 @@ async def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInt return {"outputs": {"success": result}} +def get_nested_value(obj: Any, path: str) -> Any: + """ + Get value from nested object using resolve_any() with graceful None on failure. 
+
+    Supports multiple path formats:
+    - Dot notation: "user.address.city", "items.0.name"
+    - JSON Path: "$.user.address.city", "$.items[0].name"
+    - JSON Pointer: "/user/address/city", "/items/0/name"
+
+    Args:
+        obj: The object to traverse (dict or nested structure)
+        path: Path expression in any supported format
+
+    Returns:
+        The value at the specified path, or None if path doesn't exist or resolution fails
+    """
+    if obj is None:
+        return None
+
+    try:
+        return resolve_any(path, obj)
+    except (KeyError, IndexError, ValueError, TypeError, ImportError):
+        return None
+
+
+async def auto_json_multi_field_match(
+    inputs: Dict[str, Any],  # pylint: disable=unused-argument
+    output: Union[str, Dict[str, Any]],
+    data_point: Dict[str, Any],
+    app_params: Dict[str, Any],  # pylint: disable=unused-argument
+    settings_values: Dict[str, Any],
+    lm_providers_keys: Dict[str, Any],  # pylint: disable=unused-argument
+) -> Result:
+    """
+    Evaluator that compares multiple configured fields in expected JSON against LLM output JSON.
+    Each configured field becomes a separate score in the output.
+
+    Returns a Result with:
+    - type="object" containing one score per configured field plus an aggregate score
+    - Each field score is 1.0 (match) or 0.0 (no match)
+    - 'aggregate_score' is the fraction of fields that matched (0.0-1.0)
+    """
+    try:
+        output = validate_string_output("json_multi_field_match", output)
+        correct_answer = get_correct_answer(data_point, settings_values)
+        eval_inputs = {"ground_truth": correct_answer, "prediction": output}
+        response = await json_multi_field_match(
+            input=EvaluatorInputInterface(
+                **{"inputs": eval_inputs, "settings": settings_values}
+            )
+        )
+        return Result(type="object", value=response["outputs"])
+    except ValueError as e:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(
+                message=str(e),
+            ),
+        )
+    except Exception:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(
+                message="Error during JSON Multi-Field Match evaluation",
+                stacktrace=str(traceback.format_exc()),
+            ),
+        )
+
+
+async def json_multi_field_match(
+    input: EvaluatorInputInterface,
+) -> EvaluatorOutputInterface:
+    """
+    Compare configured fields in expected JSON against LLM output JSON.
+    Each configured field becomes a separate score in the output.
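+
+    For example, settings {"fields": ["name", "email"]} with ground_truth
+    '{"name": "Ada", "email": "a@x.io"}' and prediction '{"name": "Ada", "email": "b@x.io"}'
+    (illustrative values) yield {"outputs": {"name": 1.0, "email": 0.0, "aggregate_score": 0.5}}.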
+ + Args: + input: EvaluatorInputInterface with: + - inputs.prediction: JSON string from LLM output + - inputs.ground_truth: JSON string from test data column + - settings.fields: List of field paths (strings) e.g., ["name", "email", "user.address.city"] + + Returns: + EvaluatorOutputInterface with one score per configured field plus overall score + """ + fields = input.settings.get("fields", []) + + if not fields: + raise ValueError("No fields configured for comparison") + + # Parse both JSON objects + prediction = input.inputs.get("prediction", "") + ground_truth = input.inputs.get("ground_truth", "") + + try: + if isinstance(ground_truth, str): + expected = json.loads(ground_truth) + else: + expected = ground_truth + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in ground truth: {str(e)}") + + try: + if isinstance(prediction, str): + actual = json.loads(prediction) + else: + actual = prediction + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in prediction: {str(e)}") + + results: Dict[str, Any] = {} + matches = 0 + + for field_path in fields: + # Support nested fields with dot notation + expected_val = get_nested_value(expected, field_path) + actual_val = get_nested_value(actual, field_path) + + # Exact match comparison (v1 - always exact) + match = expected_val == actual_val + + results[field_path] = 1.0 if match else 0.0 + if match: + matches += 1 + + # Aggregate score is the percentage of matching fields + results["aggregate_score"] = matches / len(fields) if fields else 0.0 + + return {"outputs": results} + + async def auto_webhook_test( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], @@ -435,7 +564,7 @@ async def auto_custom_code_run( ) ) return Result(type="number", value=response["outputs"]["score"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -504,7 +633,7 @@ async def auto_ai_critique( ) ) return Result(type="number", value=response["outputs"]["score"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -515,9 +644,7 @@ async def auto_ai_critique( ) -import json -import re -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Dict, Iterable, Tuple try: import jsonpath # ✅ use module API @@ -1154,7 +1281,7 @@ async def auto_starts_with( ) ) return Result(type="bool", value=response["outputs"]["success"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1196,7 +1323,7 @@ async def auto_ends_with( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1238,7 +1365,7 @@ async def auto_contains( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1280,7 +1407,7 @@ async def auto_contains_any( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1323,7 +1450,7 @@ 
async def auto_contains_all( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1371,7 +1498,7 @@ async def auto_contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) return Result(type="bool", value=response["outputs"]["success"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1389,7 +1516,7 @@ async def contains_json(input: EvaluatorInputInterface) -> EvaluatorOutputInterf potential_json = str(input.inputs["prediction"])[start_index:end_index] json.loads(potential_json) contains_json = True - except (ValueError, json.JSONDecodeError) as e: + except (ValueError, json.JSONDecodeError): contains_json = False return {"outputs": {"success": contains_json}} @@ -1852,7 +1979,7 @@ async def auto_levenshtein_distance( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1892,7 +2019,7 @@ async def auto_similarity_match( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -2002,6 +2129,7 @@ async def auto_semantic_similarity( "auto_exact_match": auto_exact_match, "auto_regex_test": auto_regex_test, "field_match_test": auto_field_match_test, + "json_multi_field_match": auto_json_multi_field_match, "auto_webhook_test": auto_webhook_test, "auto_custom_code_run": auto_custom_code_run, "auto_ai_critique": auto_ai_critique, @@ -2024,6 +2152,7 @@ async def auto_semantic_similarity( "auto_exact_match": exact_match, "auto_regex_test": regex_test, "field_match_test": field_match_test, + "json_multi_field_match": json_multi_field_match, "auto_webhook_test": webhook_test, "auto_custom_code_run": custom_code_run, "auto_ai_critique": ai_critique, diff --git a/sdk/agenta/sdk/workflows/configurations.py b/sdk/agenta/sdk/workflows/configurations.py index 9086047c5..42310b936 100644 --- a/sdk/agenta/sdk/workflows/configurations.py +++ b/sdk/agenta/sdk/workflows/configurations.py @@ -5,6 +5,7 @@ auto_exact_match_v0_configuration = WorkflowServiceConfiguration() auto_regex_test_v0_configuration = WorkflowServiceConfiguration() field_match_test_v0_configuration = WorkflowServiceConfiguration() +json_multi_field_match_v0_configuration = WorkflowServiceConfiguration() auto_webhook_test_v0_configuration = WorkflowServiceConfiguration() auto_custom_code_run_v0_configuration = WorkflowServiceConfiguration() auto_ai_critique_v0_configuration = WorkflowServiceConfiguration() diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index b7b28080b..3ef2faf3a 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -1,14 +1,14 @@ -from typing import List, Any, Optional, Any, Dict, Union -from json import dumps, loads -import traceback import json -import re import math +import re +import traceback +from difflib import SequenceMatcher +from json import dumps, loads +from typing import Any, Dict, List, Optional, Union import httpx from pydantic import BaseModel, Field -from difflib import SequenceMatcher from agenta.sdk.utils.logging import get_module_logger from agenta.sdk.utils.lazy import ( @@ -21,33 +21,30 
@@ from agenta.sdk.litellm import mockllm from agenta.sdk.types import PromptTemplate, Message from agenta.sdk.managers.secrets import SecretsManager - from agenta.sdk.decorators.tracing import instrument - +from agenta.sdk.litellm.litellm import litellm_handler from agenta.sdk.models.shared import Data -from agenta.sdk.models.tracing import Trace -from agenta.sdk.workflows.sandbox import execute_code_safely from agenta.sdk.workflows.errors import ( + CustomCodeServerV0Error, InvalidConfigurationParametersV0Error, - MissingConfigurationParameterV0Error, InvalidConfigurationParameterV0Error, InvalidInputsV0Error, - MissingInputV0Error, InvalidInputV0Error, InvalidOutputsV0Error, - MissingOutputV0Error, InvalidSecretsV0Error, JSONDiffV0Error, LevenshteinDistanceV0Error, - SyntacticSimilarityV0Error, + MissingConfigurationParameterV0Error, + MissingInputV0Error, + PromptCompletionV0Error, + PromptFormattingV0Error, + RegexPatternV0Error, SemanticSimilarityV0Error, - WebhookServerV0Error, + SyntacticSimilarityV0Error, WebhookClientV0Error, - CustomCodeServerV0Error, - RegexPatternV0Error, - PromptFormattingV0Error, - PromptCompletionV0Error, + WebhookServerV0Error, ) +from agenta.sdk.workflows.sandbox import execute_code_safely log = get_module_logger(__name__) @@ -57,7 +54,6 @@ def _configure_litellm(): litellm = _load_litellm() if not litellm: raise ImportError("litellm is required for completion handling.") - from agenta.sdk.litellm.litellm import litellm_handler litellm.logging = False litellm.set_verbose = False @@ -85,9 +81,7 @@ def _compute_similarity(embedding_1: List[float], embedding_2: List[float]) -> f return dot / (norm1 * norm2) -import json -import re -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Iterable, Tuple # ========= Scheme detection ========= @@ -392,7 +386,7 @@ def auto_exact_match_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -400,7 +394,7 @@ def auto_exact_match_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -437,7 +431,7 @@ def auto_regex_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "regex_pattern" in parameters: + if "regex_pattern" not in parameters: raise MissingConfigurationParameterV0Error(path="regex_pattern") regex_pattern = parameters["regex_pattern"] @@ -495,12 +489,12 @@ def field_match_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "json_field" in parameters: + if "json_field" not in parameters: raise MissingConfigurationParameterV0Error(path="json_field") json_field = str(parameters["json_field"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -508,7 +502,7 @@ def field_match_test_v0( if 
inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -521,7 +515,7 @@ def field_match_test_v0( if isinstance(outputs, str): try: outputs_dict = loads(outputs) - except json.JSONDecodeError as e: + except json.JSONDecodeError: # raise InvalidOutputsV0Error(expected="dict", got=outputs) from e return {"success": False} @@ -529,7 +523,7 @@ def field_match_test_v0( # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs) return {"success": False} - if not json_field in outputs_dict: + if json_field not in outputs_dict: # raise MissingOutputV0Error(path=json_field) return {"success": False} @@ -540,6 +534,148 @@ def field_match_test_v0( return {"success": success} +def _get_nested_value(obj: Any, path: str) -> Any: + """ + Get value from nested object using resolve_any() with graceful None on failure. + + Supports multiple path formats: + - Dot notation: "user.address.city", "items.0.name" + - JSON Path: "$.user.address.city", "$.items[0].name" + - JSON Pointer: "/user/address/city", "/items/0/name" + + Args: + obj: The object to traverse (dict or list) + path: Path expression in any supported format + + Returns: + The value at the path, or None if path doesn't exist or resolution fails + """ + if obj is None: + return None + + try: + return resolve_any(path, obj) + except (KeyError, IndexError, ValueError, TypeError, ImportError): + return None + + +@instrument(annotate=True) +def json_multi_field_match_v0( + parameters: Optional[Data] = None, + inputs: Optional[Data] = None, + outputs: Optional[Union[Data, str]] = None, +) -> Any: + """ + Multi-field JSON match evaluator for comparing multiple fields between expected and actual JSON. + + Each configured field becomes a separate score (0 or 1), and an aggregate_score shows + the percentage of matching fields. Useful for entity extraction validation. 
+ + Args: + inputs: Testcase data with ground truth JSON + outputs: Output from the workflow execution (expected to be JSON string or dict) + parameters: Configuration with: + - fields: List of field paths to compare (e.g., ["name", "user.address.city"]) + - correct_answer_key: Key in inputs containing the expected JSON + + Returns: + Dict with per-field scores and aggregate_score, e.g.: + {"name": 1.0, "email": 0.0, "aggregate_score": 0.5} + """ + if parameters is None or not isinstance(parameters, dict): + raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) + + if "fields" not in parameters: + raise MissingConfigurationParameterV0Error(path="fields") + + fields = parameters["fields"] + + if not isinstance(fields, list) or len(fields) == 0: + raise InvalidConfigurationParameterV0Error( + path="fields", + expected="non-empty list", + got=fields, + ) + + if "correct_answer_key" not in parameters: + raise MissingConfigurationParameterV0Error(path="correct_answer_key") + + correct_answer_key = str(parameters["correct_answer_key"]) + + if inputs is None or not isinstance(inputs, dict): + raise InvalidInputsV0Error(expected="dict", got=inputs) + + if correct_answer_key not in inputs: + raise MissingInputV0Error(path=correct_answer_key) + + correct_answer = inputs[correct_answer_key] + + # Parse ground truth JSON + if isinstance(correct_answer, str): + try: + expected = json.loads(correct_answer) + except json.JSONDecodeError: + raise InvalidInputV0Error( + path=correct_answer_key, + expected="valid JSON string", + got=correct_answer, + ) + elif isinstance(correct_answer, dict): + expected = correct_answer + else: + raise InvalidInputV0Error( + path=correct_answer_key, + expected=["dict", "str"], + got=correct_answer, + ) + + # Parse output JSON + if not isinstance(outputs, str) and not isinstance(outputs, dict): + # Return all zeros if output is invalid + results: Dict[str, Any] = {field: 0.0 for field in fields} + results["aggregate_score"] = 0.0 + return results + + if isinstance(outputs, str): + try: + actual = json.loads(outputs) + except json.JSONDecodeError: + # Return all zeros if output is not valid JSON + results = {field: 0.0 for field in fields} + results["aggregate_score"] = 0.0 + return results + else: + actual = outputs + + if not isinstance(actual, dict): + # Return all zeros if parsed output is not a dict + results = {field: 0.0 for field in fields} + results["aggregate_score"] = 0.0 + return results + + # -------------------------------------------------------------------------- + # Compare each configured field + results = {} + matches = 0 + + for field_path in fields: + expected_val = _get_nested_value(expected, field_path) + actual_val = _get_nested_value(actual, field_path) + + # Exact match comparison + match = expected_val == actual_val + + results[field_path] = 1.0 if match else 0.0 + if match: + matches += 1 + + # Aggregate score is the percentage of matching fields + results["aggregate_score"] = matches / len(fields) if fields else 0.0 + # -------------------------------------------------------------------------- + + return results + + @instrument(annotate=True) async def auto_webhook_test_v0( parameters: Optional[Data] = None, @@ -560,12 +696,12 @@ async def auto_webhook_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "webhook_url" in parameters: + if "webhook_url" not in parameters: raise 
MissingConfigurationParameterV0Error(path="webhook_url") webhook_url = str(parameters["webhook_url"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -573,7 +709,7 @@ async def auto_webhook_test_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -665,12 +801,12 @@ async def auto_custom_code_run_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "code" in parameters: + if "code" not in parameters: raise MissingConfigurationParameterV0Error(path="code") code = str(parameters["code"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -678,7 +814,7 @@ async def auto_custom_code_run_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -756,7 +892,7 @@ async def auto_ai_critique_v0( correct_answer_key = parameters.get("correct_answer_key") - if not "prompt_template" in parameters: + if "prompt_template" not in parameters: raise MissingConfigurationParameterV0Error(path="prompt_template") prompt_template = parameters.get("prompt_template") @@ -787,7 +923,7 @@ async def auto_ai_critique_v0( "json_schema" if template_version == "4" else "text" ) - if not response_type in ["text", "json_object", "json_schema"]: + if response_type not in ["text", "json_object", "json_schema"]: raise InvalidConfigurationParameterV0Error( path="response_type", expected=["text", "json_object", "json_schema"], @@ -992,7 +1128,7 @@ def auto_starts_with_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "prefix" in parameters: + if "prefix" not in parameters: raise MissingConfigurationParameterV0Error(path="prefix") prefix = parameters["prefix"] @@ -1041,7 +1177,7 @@ def auto_ends_with_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "suffix" in parameters: + if "suffix" not in parameters: raise MissingConfigurationParameterV0Error(path="suffix") suffix = parameters["suffix"] @@ -1090,7 +1226,7 @@ def auto_contains_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substring" in parameters: + if "substring" not in parameters: raise MissingConfigurationParameterV0Error(path="substring") substring = parameters["substring"] @@ -1139,7 +1275,7 @@ def auto_contains_any_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substrings" in parameters: + if "substrings" not in parameters: raise MissingConfigurationParameterV0Error(path="substrings") 
substrings = parameters["substrings"] @@ -1197,7 +1333,7 @@ def auto_contains_all_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substrings" in parameters: + if "substrings" not in parameters: raise MissingConfigurationParameterV0Error(path="substrings") substrings = parameters["substrings"] @@ -1297,7 +1433,7 @@ def auto_json_diff_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1305,7 +1441,7 @@ def auto_json_diff_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1389,7 +1525,7 @@ def auto_levenshtein_distance_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1399,7 +1535,7 @@ def auto_levenshtein_distance_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1494,7 +1630,7 @@ def auto_similarity_match_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1504,7 +1640,7 @@ def auto_similarity_match_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1587,7 +1723,7 @@ async def auto_semantic_similarity_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1600,7 +1736,7 @@ async def auto_semantic_similarity_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1703,7 +1839,7 @@ async def completion_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if 
not "prompt" in parameters: + if "prompt" not in parameters: raise MissingConfigurationParameterV0Error(path="prompt") params: Dict[str, Any] = {**(parameters or {})} diff --git a/sdk/agenta/sdk/workflows/interfaces.py b/sdk/agenta/sdk/workflows/interfaces.py index 85334ab6c..6c1e5edfb 100644 --- a/sdk/agenta/sdk/workflows/interfaces.py +++ b/sdk/agenta/sdk/workflows/interfaces.py @@ -169,6 +169,53 @@ ), ) +json_multi_field_match_v0_interface = WorkflowServiceInterface( + uri="agenta:built-in:json_multi_field_match:v0", + schemas=dict( # type: ignore + parameters={ + "type": "object", + "title": "JSON Multi-Field Match Parameters", + "description": "Settings for comparing multiple JSON fields against expected values from a ground truth column.", + "properties": { + "correct_answer_key": { + "type": "string", + "title": "Ground Truth Column", + "description": "Column in test data containing the JSON ground truth.", + "default": "correct_answer", + }, + "fields": { + "type": "array", + "title": "Fields to Compare", + "description": "List of JSON field paths (dot notation) to compare. Each field becomes a separate score.", + "items": {"type": "string"}, + "default": [], + }, + }, + "required": ["correct_answer_key", "fields"], + "additionalProperties": False, + }, + inputs={ + "type": "object", + "title": "JSON Multi-Field Match Inputs", + "description": "Testcase data including the JSON ground truth.", + }, + outputs={ + "type": "object", + "title": "JSON Multi-Field Match Outputs", + "description": "Per-field match scores and aggregate score. Each field produces a 0 or 1 output.", + "properties": { + "aggregate_score": { + "type": "number", + "title": "Aggregate Score", + "description": "Percentage of matched fields (0-1).", + }, + }, + "required": ["aggregate_score"], + "additionalProperties": True, # Allows dynamic field outputs + }, + ), +) + auto_webhook_test_v0_interface = WorkflowServiceInterface( uri="agenta:built-in:auto_webhook_test:v0", schemas=dict( # type: ignore diff --git a/sdk/agenta/sdk/workflows/utils.py b/sdk/agenta/sdk/workflows/utils.py index d86f499da..2ecd57d21 100644 --- a/sdk/agenta/sdk/workflows/utils.py +++ b/sdk/agenta/sdk/workflows/utils.py @@ -9,6 +9,7 @@ auto_exact_match_v0, auto_regex_test_v0, field_match_test_v0, + json_multi_field_match_v0, auto_webhook_test_v0, auto_custom_code_run_v0, auto_ai_critique_v0, @@ -31,6 +32,7 @@ auto_exact_match_v0_interface, auto_regex_test_v0_interface, field_match_test_v0_interface, + json_multi_field_match_v0_interface, auto_webhook_test_v0_interface, auto_custom_code_run_v0_interface, auto_ai_critique_v0_interface, @@ -54,6 +56,7 @@ auto_exact_match_v0_configuration, auto_regex_test_v0_configuration, field_match_test_v0_configuration, + json_multi_field_match_v0_configuration, auto_webhook_test_v0_configuration, auto_custom_code_run_v0_configuration, auto_ai_critique_v0_configuration, @@ -78,6 +81,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0_interface), auto_regex_test=dict(v0=auto_regex_test_v0_interface), field_match_test=dict(v0=field_match_test_v0_interface), + json_multi_field_match=dict(v0=json_multi_field_match_v0_interface), auto_webhook_test=dict(v0=auto_webhook_test_v0_interface), auto_custom_code_run=dict(v0=auto_custom_code_run_v0_interface), auto_ai_critique=dict(v0=auto_ai_critique_v0_interface), @@ -104,6 +108,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0_configuration), auto_regex_test=dict(v0=auto_regex_test_v0_configuration), field_match_test=dict(v0=field_match_test_v0_configuration), + 
json_multi_field_match=dict(v0=json_multi_field_match_v0_configuration), auto_webhook_test=dict(v0=auto_webhook_test_v0_configuration), auto_custom_code_run=dict(v0=auto_custom_code_run_v0_configuration), auto_ai_critique=dict(v0=auto_ai_critique_v0_configuration), @@ -160,6 +165,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0), auto_regex_test=dict(v0=auto_regex_test_v0), field_match_test=dict(v0=field_match_test_v0), + json_multi_field_match=dict(v0=json_multi_field_match_v0), auto_webhook_test=dict(v0=auto_webhook_test_v0), auto_custom_code_run=dict(v0=auto_custom_code_run_v0), auto_ai_critique=dict(v0=auto_ai_critique_v0), diff --git a/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts b/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts index ec01f427c..ff617479c 100644 --- a/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts +++ b/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts @@ -20,6 +20,7 @@ export const ENABLED_EVALUATORS = [ "auto_semantic_similarity", "auto_regex_test", "field_match_test", + "json_multi_field_match", "auto_json_diff", "auto_ai_critique", "auto_custom_code_run", diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx index 3098026f1..85f09fd17 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx @@ -11,6 +11,7 @@ import {isValidRegex} from "@/oss/lib/helpers/validators" import {generatePaths} from "@/oss/lib/transformers" import {EvaluationSettingsTemplate, JSSTheme} from "@/oss/lib/Types" +import {FieldsTagsEditor} from "./FieldsTagsEditor" import {JSONSchemaEditor} from "./JSONSchema" import {Messages} from "./Messages" @@ -215,6 +216,8 @@ export const DynamicFormField: React.FC = ({ : JSON.stringify(savedValue ?? {}, null, 2) } /> + ) : type === "fields_tags_editor" ? ( + ) : null} )} diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx new file mode 100644 index 000000000..a96a07a37 --- /dev/null +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -0,0 +1,228 @@ +/** + * FieldsTagsEditor - Tag-based editor for JSON field paths + * + * This component provides an add/remove interface for managing JSON field paths. + * Users can: + * - Add fields manually using an input field (supports dot notation for nested paths) + * - Remove fields by clicking the X button on tags + * - Detect fields from the selected testcase using a dedicated button + * + * The component also displays a non-removable "overall" field representing + * the aggregate result across all fields. 
+ * + * Auto-detection behavior: + * - When a testcase is loaded and no fields are configured, fields are auto-detected + */ + +import {useCallback, useEffect, useMemo, useRef, useState} from "react" + +import {PlusOutlined, SearchOutlined} from "@ant-design/icons" +import {Button, Form, Input, Tag, Tooltip, Typography} from "antd" +import type {FormInstance} from "antd/es/form" +import {useAtomValue} from "jotai" + +import {extractJsonPaths, safeParseJson} from "@/oss/lib/helpers/extractJsonPaths" + +import {playgroundSelectedTestcaseAtom} from "./state/atoms" + +const {Text} = Typography + +interface FieldsTagsEditorProps { + value?: string[] + onChange?: (value: string[]) => void + form?: FormInstance + name?: string | string[] + correctAnswerKey?: string +} + +/** + * Tag-based editor for managing JSON field paths with add/remove functionality. + * Includes "Detect from testcase" feature to auto-populate fields. + */ +export const FieldsTagsEditor: React.FC = ({ + value = [], + onChange, + form, + correctAnswerKey = "correct_answer", +}) => { + const [inputValue, setInputValue] = useState("") + // Track if we've already auto-detected to avoid re-triggering + const hasAutoDetectedRef = useRef(false) + + // Read the selected testcase from the playground atom + const testcaseSelection = useAtomValue(playgroundSelectedTestcaseAtom) + const testcase = testcaseSelection?.testcase + + // Watch the correct_answer_key from form to react to changes + // Using Form.useWatch instead of form.getFieldValue for reactivity + const formCorrectAnswerKey = Form.useWatch(["settings_values", "correct_answer_key"], form) + const effectiveKey = formCorrectAnswerKey || correctAnswerKey + + // Check if we can detect fields from testcase + const canDetectFields = useMemo(() => { + if (!testcase) return false + const groundTruthValue = testcase[effectiveKey] + if (!groundTruthValue) return false + const parsed = safeParseJson(groundTruthValue) + return parsed !== null + }, [testcase, effectiveKey]) + + // Extract available fields from the testcase + const detectableFields = useMemo(() => { + if (!testcase) return [] + const groundTruthValue = testcase[effectiveKey] + if (!groundTruthValue) return [] + const parsed = safeParseJson(groundTruthValue) + if (!parsed) return [] + return extractJsonPaths(parsed) + }, [testcase, effectiveKey]) + + // Auto-detect fields when testcase is loaded and no fields are configured + useEffect(() => { + // Only auto-detect if: + // 1. We haven't already auto-detected + // 2. There are no user-defined fields + // 3. 
We can detect fields from the testcase + if (!hasAutoDetectedRef.current && value.length === 0 && detectableFields.length > 0) { + hasAutoDetectedRef.current = true + onChange?.(detectableFields) + } + }, [detectableFields, value.length, onChange]) + + // Handle adding a new field + const handleAddField = useCallback(() => { + const trimmed = inputValue.trim() + if (!trimmed) return + + // Don't add duplicates + if (value.includes(trimmed)) { + setInputValue("") + return + } + + // Don't allow reserved field names + if (trimmed === "aggregate_score") { + setInputValue("") + return + } + + onChange?.([...value, trimmed]) + setInputValue("") + }, [inputValue, value, onChange]) + + // Handle removing a field + const handleRemoveField = useCallback( + (fieldToRemove: string) => { + onChange?.(value.filter((f) => f !== fieldToRemove)) + }, + [value, onChange], + ) + + // Handle detecting fields from testcase (replaces existing fields) + const handleDetectFields = useCallback(() => { + if (detectableFields.length > 0) { + onChange?.(detectableFields) + } + }, [detectableFields, onChange]) + + // Handle Enter key in input + const handleInputKeyDown = useCallback( + (e: React.KeyboardEvent) => { + if (e.key === "Enter") { + e.preventDefault() + handleAddField() + } + }, + [handleAddField], + ) + + // Generate tooltip for disabled detect button + const detectButtonTooltip = useMemo(() => { + if (!testcase) { + return "Select a testcase first to detect fields" + } + if (!canDetectFields) { + return `No JSON object found in the "${effectiveKey}" column` + } + return `Detect ${detectableFields.length} field(s) from testcase (replaces current fields)` + }, [testcase, canDetectFields, effectiveKey, detectableFields.length]) + + return ( +
+ {/* Field Tags Display */} +
+ {/* Non-removable aggregate_score tag */} + + + aggregate_score + + + + {/* User-defined field tags */} + {value.map((field) => ( + handleRemoveField(field)} + className="flex items-center font-mono text-[13px] !m-0" + > + {field} + + ))} + + {/* Empty state message */} + {value.length === 0 && ( + + Add fields to compare or detect them from a testcase + + )} +
+ + {/* Add Field Input */} +
+ setInputValue(e.target.value)} + onKeyDown={handleInputKeyDown} + suffix={ + + + ? + + + } + /> + +
+ + {/* Actions Row */} +
+ + Each field creates a column with value 0 (no match) or 1 (match) + + + + + +
+
+    )
+}
+
+export default FieldsTagsEditor
diff --git a/web/oss/src/lib/Types.ts b/web/oss/src/lib/Types.ts
index 656a5fe08..cfcb4ee49 100644
--- a/web/oss/src/lib/Types.ts
+++ b/web/oss/src/lib/Types.ts
@@ -991,6 +991,7 @@ type ValueTypeOptions =
     | "messages"
     | "multiple_choice"
     | "llm_response_schema"
+    | "fields_tags_editor"
 
 export interface EvaluationSettingsTemplate {
     type: ValueTypeOptions
diff --git a/web/oss/src/lib/helpers/extractJsonPaths.ts b/web/oss/src/lib/helpers/extractJsonPaths.ts
new file mode 100644
index 000000000..62176319e
--- /dev/null
+++ b/web/oss/src/lib/helpers/extractJsonPaths.ts
@@ -0,0 +1,86 @@
+/**
+ * Utility functions for extracting JSON paths from objects.
+ * Used by the JSON Multi-Field Match evaluator to auto-detect fields from testcase data.
+ */
+
+/**
+ * Recursively extracts all leaf paths from a JSON object using dot notation.
+ *
+ * Example:
+ *   Input: {user: {name: "John", address: {city: "NYC"}}}
+ *   Output: ["user.name", "user.address.city"]
+ *
+ * @param obj - The object to extract paths from
+ * @param prefix - Current path prefix (used for recursion)
+ * @returns Array of dot-notation paths to all leaf values
+ */
+export const extractJsonPaths = (obj: unknown, prefix = ""): string[] => {
+    if (obj === null || obj === undefined) return []
+    if (typeof obj !== "object") return prefix ? [prefix] : []
+
+    // For arrays, we don't expand individual indices - just mark the path
+    // This keeps the UI manageable and matches common use cases
+    if (Array.isArray(obj)) {
+        return prefix ? [prefix] : []
+    }
+
+    const paths: string[] = []
+
+    for (const key of Object.keys(obj as Record<string, unknown>)) {
+        const newPrefix = prefix ? `${prefix}.${key}` : key
+        const value = (obj as Record<string, unknown>)[key]
+
+        if (value !== null && typeof value === "object" && !Array.isArray(value)) {
+            // Recurse into nested objects
+            paths.push(...extractJsonPaths(value, newPrefix))
+        } else {
+            // Leaf node (primitive, array, or null)
+            paths.push(newPrefix)
+        }
+    }
+
+    return paths
+}
+
+/**
+ * Parses a JSON string and extracts all paths.
+ * Returns empty array if parsing fails.
+ *
+ * @param jsonString - JSON string to parse and extract paths from
+ * @returns Array of dot-notation paths
+ */
+export const extractJsonPathsFromString = (jsonString: string): string[] => {
+    try {
+        const parsed = JSON.parse(jsonString)
+        return extractJsonPaths(parsed)
+    } catch {
+        return []
+    }
+}
+
+/**
+ * Safely parses a value that might be JSON string or already an object.
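+ *
+ * For example, both '{"name": "Ada"}' (a JSON string) and {name: "Ada"} (an object)
+ * yield {name: "Ada"}, while arrays, primitives, and invalid JSON yield null.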
+ *
+ * @param value - Value to parse (string or object)
+ * @returns Parsed object or null if invalid
+ */
+export const safeParseJson = (value: unknown): Record<string, unknown> | null => {
+    if (value === null || value === undefined) return null
+
+    if (typeof value === "object" && !Array.isArray(value)) {
+        return value as Record<string, unknown>
+    }
+
+    if (typeof value === "string") {
+        try {
+            const parsed = JSON.parse(value)
+            if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
+                return parsed
+            }
+        } catch {
+            return null
+        }
+    }
+
+    return null
+}
diff --git a/web/oss/src/services/evaluators/index.ts b/web/oss/src/services/evaluators/index.ts
index f8576b9bc..2a9bb15de 100644
--- a/web/oss/src/services/evaluators/index.ts
+++ b/web/oss/src/services/evaluators/index.ts
@@ -67,6 +67,7 @@ const evaluatorIconsMap = {
     auto_similarity_match: similarityImg,
     auto_regex_test: regexImg,
     field_match_test: exactMatchImg,
+    json_multi_field_match: bracketCurlyImg,
     auto_webhook_test: webhookImg,
     auto_ai_critique: aiImg,
     auto_custom_code_run: codeImg,
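The scoring behavior added above can be sketched end to end in a few lines. This is a minimal, self-contained approximation with illustrative data, covering dot-notation paths only; the shipped evaluator delegates nested lookups to resolve_any(), which also accepts JSON Path and JSON Pointer expressions:

    import json
    from typing import Any, Dict, List

    def resolve_dot_path(obj: Any, path: str) -> Any:
        # Walk "user.address.city"-style paths; return None when any segment is missing.
        current = obj
        for segment in path.split("."):
            if isinstance(current, dict) and segment in current:
                current = current[segment]
            elif isinstance(current, list) and segment.isdigit() and int(segment) < len(current):
                current = current[int(segment)]
            else:
                return None
        return current

    def multi_field_scores(ground_truth: str, prediction: str, fields: List[str]) -> Dict[str, float]:
        # One 0/1 score per configured field, plus aggregate_score = fraction of matching fields.
        expected = json.loads(ground_truth)
        actual = json.loads(prediction)
        scores = {
            field: 1.0 if resolve_dot_path(expected, field) == resolve_dot_path(actual, field) else 0.0
            for field in fields
        }
        scores["aggregate_score"] = sum(scores.values()) / len(fields) if fields else 0.0
        return scores

    print(multi_field_scores(
        '{"user": {"name": "Ada", "address": {"city": "NYC"}}, "age": 36}',
        '{"user": {"name": "Ada", "address": {"city": "SF"}}, "age": 36}',
        ["user.name", "user.address.city", "age"],
    ))
    # {'user.name': 1.0, 'user.address.city': 0.0, 'age': 1.0, 'aggregate_score': 0.6666666666666666}

Keeping each field as its own 0/1 metric makes individual extraction failures visible as separate columns in the evaluation results, while aggregate_score provides the single number used to compare runs.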