From 2520a198136aa42991ff7870bd2258e204b45be1 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Mon, 22 Dec 2025 20:48:05 +0100 Subject: [PATCH 01/11] feat(api): add archived field to LegacyEvaluator model --- api/oss/src/models/api/evaluation_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/oss/src/models/api/evaluation_model.py b/api/oss/src/models/api/evaluation_model.py index a914ff7c7b..a17ca315ad 100644 --- a/api/oss/src/models/api/evaluation_model.py +++ b/api/oss/src/models/api/evaluation_model.py @@ -20,6 +20,7 @@ class LegacyEvaluator(BaseModel): oss: Optional[bool] = False requires_llm_api_keys: Optional[bool] = False tags: List[str] + archived: Optional[bool] = False class EvaluatorConfig(BaseModel): From fb13c912dafe7a2d74e418e8741510f88264bf93 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Mon, 22 Dec 2025 20:48:13 +0100 Subject: [PATCH 02/11] feat(api): introduce JSON Multi-Field Match evaluator and deprecate JSON Field Match --- .../src/resources/evaluators/evaluators.py | 28 ++++ api/oss/src/services/evaluators_service.py | 143 ++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py index 89d9974f8d..c6b2b189d9 100644 --- a/api/oss/src/resources/evaluators/evaluators.py +++ b/api/oss/src/resources/evaluators/evaluators.py @@ -332,6 +332,7 @@ "name": "JSON Field Match", "key": "field_match_test", "direct_use": False, + "archived": True, # Deprecated - use json_multi_field_match instead "settings_template": { "json_field": { "label": "JSON Field", @@ -355,6 +356,33 @@ "oss": True, "tags": ["classifiers"], }, + { + "name": "JSON Multi-Field Match", + "key": "json_multi_field_match", + "direct_use": False, + "settings_template": { + "fields": { + "label": "Fields to Compare", + "type": "fields_checkbox_list", # Custom type - checkbox list with auto-detection from testcase + "required": True, + "description": "Select which JSON fields to compare (auto-detected from testcase)", + }, + "correct_answer_key": { + "label": "Expected Answer Column", + "default": "correct_answer", + "type": "string", + "required": True, + "description": "Column name containing the expected JSON object", + "ground_truth_key": True, + "advanced": True, # Hidden in advanced section + }, + }, + "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate score column (0 or 1), with an overall score showing the match ratio. Useful for entity extraction validation.", + "requires_testcase": "always", + "requires_trace": "always", + "oss": True, + "tags": ["classifiers"], + }, { "name": "JSON Diff Match", "key": "auto_json_diff", diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index b866db4b16..25505e4029 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -352,6 +352,147 @@ async def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInt return {"outputs": {"success": result}} +def get_nested_value(obj: Any, path: str) -> Any: + """ + Get value from nested dict/object using dot notation (e.g., 'user.address.city'). 
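+
+    Example (illustrative):
+        get_nested_value({"user": {"address": {"city": "NYC"}}}, "user.address.city")  # -> "NYC"
+        get_nested_value({"items": [{"name": "a"}]}, "items.0.name")  # -> "a" (numeric list index)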
+ + Args: + obj: The object to traverse (dict or nested structure) + path: Dot-notation path to the value (e.g., 'user.address.city') + + Returns: + The value at the specified path, or None if path doesn't exist + """ + if obj is None: + return None + + keys = path.split(".") + value = obj + + for key in keys: + if isinstance(value, dict): + value = value.get(key) + elif isinstance(value, list) and key.isdigit(): + # Support array indexing with numeric keys + idx = int(key) + value = value[idx] if 0 <= idx < len(value) else None + else: + return None + + if value is None: + return None + + return value + + +async def auto_json_multi_field_match( + inputs: Dict[str, Any], # pylint: disable=unused-argument + output: Union[str, Dict[str, Any]], + data_point: Dict[str, Any], + app_params: Dict[str, Any], # pylint: disable=unused-argument + settings_values: Dict[str, Any], + lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument +) -> Result: + """ + Evaluator that compares multiple configured fields in expected JSON against LLM output JSON. + Each configured field becomes a separate score in the output. + + Returns a Result with: + - type="object" containing one score per configured field plus overall score + - Each field score is 1.0 (match) or 0.0 (no match) + - Overall 'score' is the average of all field scores + """ + try: + output = validate_string_output("json_multi_field_match", output) + correct_answer = get_correct_answer(data_point, settings_values) + eval_inputs = {"ground_truth": correct_answer, "prediction": output} + response = await json_multi_field_match( + input=EvaluatorInputInterface( + **{"inputs": eval_inputs, "settings": settings_values} + ) + ) + return Result(type="object", value=response["outputs"]) + except ValueError as e: + return Result( + type="error", + value=None, + error=Error( + message=str(e), + ), + ) + except Exception: + return Result( + type="error", + value=None, + error=Error( + message="Error during JSON Multi-Field Match evaluation", + stacktrace=str(traceback.format_exc()), + ), + ) + + +async def json_multi_field_match( + input: EvaluatorInputInterface, +) -> EvaluatorOutputInterface: + """ + Compare configured fields in expected JSON against LLM output JSON. + Each configured field becomes a separate score in the output. 
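+
+    Example (illustrative): with settings {"fields": ["name", "email"]},
+    ground_truth='{"name": "Ada", "email": "a@b.c"}' and
+    prediction='{"name": "Ada", "email": "x@y.z"}', the result is
+    {"outputs": {"name": 1.0, "email": 0.0, "score": 0.5}}.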
+ + Args: + input: EvaluatorInputInterface with: + - inputs.prediction: JSON string from LLM output + - inputs.ground_truth: JSON string from test data column + - settings.fields: List of field paths (strings) e.g., ["name", "email", "user.address.city"] + + Returns: + EvaluatorOutputInterface with one score per configured field plus overall score + """ + fields = input.settings.get("fields", []) + + if not fields: + raise ValueError("No fields configured for comparison") + + # Parse both JSON objects + prediction = input.inputs.get("prediction", "") + ground_truth = input.inputs.get("ground_truth", "") + + try: + if isinstance(ground_truth, str): + expected = json.loads(ground_truth) + else: + expected = ground_truth + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in ground truth: {str(e)}") + + try: + if isinstance(prediction, str): + actual = json.loads(prediction) + else: + actual = prediction + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in prediction: {str(e)}") + + results: Dict[str, Any] = {} + matches = 0 + + for field_path in fields: + # Support nested fields with dot notation + expected_val = get_nested_value(expected, field_path) + actual_val = get_nested_value(actual, field_path) + + # Exact match comparison (v1 - always exact) + match = expected_val == actual_val + + results[field_path] = 1.0 if match else 0.0 + if match: + matches += 1 + + # Overall score is average of field scores + results["score"] = matches / len(fields) if fields else 0.0 + + return {"outputs": results} + + async def auto_webhook_test( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], @@ -1987,6 +2128,7 @@ async def auto_semantic_similarity( "auto_exact_match": auto_exact_match, "auto_regex_test": auto_regex_test, "field_match_test": auto_field_match_test, + "json_multi_field_match": auto_json_multi_field_match, "auto_webhook_test": auto_webhook_test, "auto_custom_code_run": auto_custom_code_run, "auto_ai_critique": auto_ai_critique, @@ -2008,6 +2150,7 @@ async def auto_semantic_similarity( "auto_exact_match": exact_match, "auto_regex_test": regex_test, "field_match_test": field_match_test, + "json_multi_field_match": json_multi_field_match, "auto_webhook_test": webhook_test, "auto_custom_code_run": custom_code_run, "auto_ai_critique": ai_critique, From 70f1cb66f54e52d0aadaf2798fb315b3cf15b656 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:20:08 +0100 Subject: [PATCH 03/11] feat(api): implement JSON Multi-Field Match evaluator with dynamic scoring and update configurations --- api/oss/src/core/evaluators/service.py | 22 +++ sdk/agenta/sdk/workflows/configurations.py | 1 + sdk/agenta/sdk/workflows/handlers.py | 149 +++++++++++++++++++++ sdk/agenta/sdk/workflows/interfaces.py | 52 +++++++ sdk/agenta/sdk/workflows/utils.py | 6 + 5 files changed, 230 insertions(+) diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index 6e547addfe..8a2d38955d 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1435,11 +1435,33 @@ def _transfer_evaluator_revision_data( else None ) headers = None + # TODO: This function reconstructs output schemas from old evaluator settings. + # When fully migrating to the new workflow-based evaluator system, the output + # schema should be stored directly in the evaluator revision (workflow revision) + # at configuration time, rather than being inferred from settings here. 
+ # For evaluators with dynamic outputs (auto_ai_critique, json_multi_field_match), + # the frontend/API should build and save the complete output schema when the + # user configures the evaluator. outputs_schema = None if str(old_evaluator.evaluator_key) == "auto_ai_critique": json_schema = old_evaluator.settings_values.get("json_schema", None) if json_schema and isinstance(json_schema, dict): outputs_schema = json_schema.get("schema", None) + # Handle json_multi_field_match with dynamic field-based properties + if str(old_evaluator.evaluator_key) == "json_multi_field_match": + # Build dynamic properties based on configured fields + fields = old_evaluator.settings_values.get("fields", []) + properties = {"score": {"type": "number"}} + for field in fields: + # Each field becomes a numeric score (0 or 1) + properties[field] = {"type": "number"} + outputs_schema = { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": properties, + "required": ["score"], + "additionalProperties": False, + } if not outputs_schema: properties = ( {"score": {"type": "number"}, "success": {"type": "boolean"}} diff --git a/sdk/agenta/sdk/workflows/configurations.py b/sdk/agenta/sdk/workflows/configurations.py index 9086047c53..42310b9368 100644 --- a/sdk/agenta/sdk/workflows/configurations.py +++ b/sdk/agenta/sdk/workflows/configurations.py @@ -5,6 +5,7 @@ auto_exact_match_v0_configuration = WorkflowServiceConfiguration() auto_regex_test_v0_configuration = WorkflowServiceConfiguration() field_match_test_v0_configuration = WorkflowServiceConfiguration() +json_multi_field_match_v0_configuration = WorkflowServiceConfiguration() auto_webhook_test_v0_configuration = WorkflowServiceConfiguration() auto_custom_code_run_v0_configuration = WorkflowServiceConfiguration() auto_ai_critique_v0_configuration = WorkflowServiceConfiguration() diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index 7216761897..c628fa21a1 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -537,6 +537,155 @@ def field_match_test_v0( return {"success": success} +def _get_nested_value(obj: Any, path: str) -> Any: + """ + Get value from nested dict using dot notation path. + + Args: + obj: The object to traverse (dict or list) + path: Dot-separated path like "user.address.city" or "items.0.name" + + Returns: + The value at the path, or None if path doesn't exist + """ + if obj is None: + return None + + keys = path.split(".") + value = obj + + for key in keys: + if isinstance(value, dict): + value = value.get(key) + elif isinstance(value, list) and key.isdigit(): + idx = int(key) + value = value[idx] if 0 <= idx < len(value) else None + else: + return None + + if value is None: + return None + + return value + + +@instrument(annotate=True) +def json_multi_field_match_v0( + parameters: Optional[Data] = None, + inputs: Optional[Data] = None, + outputs: Optional[Union[Data, str]] = None, +) -> Any: + """ + Multi-field JSON match evaluator for comparing multiple fields between expected and actual JSON. + + Each configured field becomes a separate score (0 or 1), and an overall score shows + the ratio of matching fields. Useful for entity extraction validation. 
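+
+    Example (illustrative): with parameters {"fields": ["name", "user.city"],
+    "correct_answer_key": "correct_answer"}, a matching "name" and a mismatched
+    "user.city" yield {"name": 1.0, "user.city": 0.0, "score": 0.5}.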
+ + Args: + inputs: Testcase data with ground truth JSON + outputs: Output from the workflow execution (expected to be JSON string or dict) + parameters: Configuration with: + - fields: List of field paths to compare (e.g., ["name", "user.address.city"]) + - correct_answer_key: Key in inputs containing the expected JSON + + Returns: + Dict with per-field scores and overall score, e.g.: + {"name": 1.0, "email": 0.0, "score": 0.5} + """ + if parameters is None or not isinstance(parameters, dict): + raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) + + if "fields" not in parameters: + raise MissingConfigurationParameterV0Error(path="fields") + + fields = parameters["fields"] + + if not isinstance(fields, list) or len(fields) == 0: + raise InvalidConfigurationParameterV0Error( + path="fields", + expected="non-empty list", + got=fields, + ) + + if "correct_answer_key" not in parameters: + raise MissingConfigurationParameterV0Error(path="correct_answer_key") + + correct_answer_key = str(parameters["correct_answer_key"]) + + if inputs is None or not isinstance(inputs, dict): + raise InvalidInputsV0Error(expected="dict", got=inputs) + + if correct_answer_key not in inputs: + raise MissingInputV0Error(path=correct_answer_key) + + correct_answer = inputs[correct_answer_key] + + # Parse ground truth JSON + if isinstance(correct_answer, str): + try: + expected = json.loads(correct_answer) + except json.JSONDecodeError as e: + raise InvalidInputV0Error( + path=correct_answer_key, + expected="valid JSON string", + got=correct_answer, + ) + elif isinstance(correct_answer, dict): + expected = correct_answer + else: + raise InvalidInputV0Error( + path=correct_answer_key, + expected=["dict", "str"], + got=correct_answer, + ) + + # Parse output JSON + if not isinstance(outputs, str) and not isinstance(outputs, dict): + # Return all zeros if output is invalid + results: Dict[str, Any] = {field: 0.0 for field in fields} + results["score"] = 0.0 + return results + + if isinstance(outputs, str): + try: + actual = json.loads(outputs) + except json.JSONDecodeError: + # Return all zeros if output is not valid JSON + results = {field: 0.0 for field in fields} + results["score"] = 0.0 + return results + else: + actual = outputs + + if not isinstance(actual, dict): + # Return all zeros if parsed output is not a dict + results = {field: 0.0 for field in fields} + results["score"] = 0.0 + return results + + # -------------------------------------------------------------------------- + # Compare each configured field + results = {} + matches = 0 + + for field_path in fields: + expected_val = _get_nested_value(expected, field_path) + actual_val = _get_nested_value(actual, field_path) + + # Exact match comparison + match = expected_val == actual_val + + results[field_path] = 1.0 if match else 0.0 + if match: + matches += 1 + + # Overall score is ratio of matching fields + results["score"] = matches / len(fields) if fields else 0.0 + # -------------------------------------------------------------------------- + + return results + + @instrument(annotate=True) async def auto_webhook_test_v0( parameters: Optional[Data] = None, diff --git a/sdk/agenta/sdk/workflows/interfaces.py b/sdk/agenta/sdk/workflows/interfaces.py index 85334ab6cb..d9a8425d80 100644 --- a/sdk/agenta/sdk/workflows/interfaces.py +++ b/sdk/agenta/sdk/workflows/interfaces.py @@ -169,6 +169,58 @@ ), ) +json_multi_field_match_v0_interface = WorkflowServiceInterface( + uri="agenta:built-in:json_multi_field_match:v0", + schemas=dict( # 
type: ignore + parameters={ + "type": "object", + "title": "JSON Multi-Field Match Parameters", + "description": "Settings for comparing multiple JSON fields against expected values from a ground truth column.", + "properties": { + "correct_answer_key": { + "type": "string", + "title": "Ground Truth Column", + "description": "Column in test data containing the JSON ground truth.", + "default": "correct_answer", + }, + "fields": { + "type": "array", + "title": "Fields to Compare", + "description": "List of JSON field paths (dot notation) to compare. Each field becomes a separate score.", + "items": {"type": "string"}, + "default": [], + }, + }, + "required": ["correct_answer_key", "fields"], + "additionalProperties": False, + }, + inputs={ + "type": "object", + "title": "JSON Multi-Field Match Inputs", + "description": "Testcase data including the JSON ground truth.", + }, + outputs={ + "type": "object", + "title": "JSON Multi-Field Match Outputs", + "description": "Per-field match scores and overall match ratio. Each field produces a score_ output (0 or 1).", + "properties": { + "score": { + "type": "number", + "title": "Overall Score", + "description": "Ratio of matched fields (0-1).", + }, + "success": { + "type": "boolean", + "title": "Success", + "description": "True if all selected fields matched.", + }, + }, + "required": ["score", "success"], + "additionalProperties": True, # Allows dynamic score_ outputs + }, + ), +) + auto_webhook_test_v0_interface = WorkflowServiceInterface( uri="agenta:built-in:auto_webhook_test:v0", schemas=dict( # type: ignore diff --git a/sdk/agenta/sdk/workflows/utils.py b/sdk/agenta/sdk/workflows/utils.py index d86f499da4..2ecd57d219 100644 --- a/sdk/agenta/sdk/workflows/utils.py +++ b/sdk/agenta/sdk/workflows/utils.py @@ -9,6 +9,7 @@ auto_exact_match_v0, auto_regex_test_v0, field_match_test_v0, + json_multi_field_match_v0, auto_webhook_test_v0, auto_custom_code_run_v0, auto_ai_critique_v0, @@ -31,6 +32,7 @@ auto_exact_match_v0_interface, auto_regex_test_v0_interface, field_match_test_v0_interface, + json_multi_field_match_v0_interface, auto_webhook_test_v0_interface, auto_custom_code_run_v0_interface, auto_ai_critique_v0_interface, @@ -54,6 +56,7 @@ auto_exact_match_v0_configuration, auto_regex_test_v0_configuration, field_match_test_v0_configuration, + json_multi_field_match_v0_configuration, auto_webhook_test_v0_configuration, auto_custom_code_run_v0_configuration, auto_ai_critique_v0_configuration, @@ -78,6 +81,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0_interface), auto_regex_test=dict(v0=auto_regex_test_v0_interface), field_match_test=dict(v0=field_match_test_v0_interface), + json_multi_field_match=dict(v0=json_multi_field_match_v0_interface), auto_webhook_test=dict(v0=auto_webhook_test_v0_interface), auto_custom_code_run=dict(v0=auto_custom_code_run_v0_interface), auto_ai_critique=dict(v0=auto_ai_critique_v0_interface), @@ -104,6 +108,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0_configuration), auto_regex_test=dict(v0=auto_regex_test_v0_configuration), field_match_test=dict(v0=field_match_test_v0_configuration), + json_multi_field_match=dict(v0=json_multi_field_match_v0_configuration), auto_webhook_test=dict(v0=auto_webhook_test_v0_configuration), auto_custom_code_run=dict(v0=auto_custom_code_run_v0_configuration), auto_ai_critique=dict(v0=auto_ai_critique_v0_configuration), @@ -160,6 +165,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0), auto_regex_test=dict(v0=auto_regex_test_v0), field_match_test=dict(v0=field_match_test_v0), + 
json_multi_field_match=dict(v0=json_multi_field_match_v0), auto_webhook_test=dict(v0=auto_webhook_test_v0), auto_custom_code_run=dict(v0=auto_custom_code_run_v0), auto_ai_critique=dict(v0=auto_ai_critique_v0), From 7c32857f1872141cbdd2138b351f3c0b86633e7c Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:20:45 +0100 Subject: [PATCH 04/11] feat(api): add json_multi_field_match evaluator and corresponding icon mapping --- web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts | 1 + web/oss/src/services/evaluators/index.ts | 1 + 2 files changed, 2 insertions(+) diff --git a/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts b/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts index ec01f427c4..ff617479c5 100644 --- a/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts +++ b/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts @@ -20,6 +20,7 @@ export const ENABLED_EVALUATORS = [ "auto_semantic_similarity", "auto_regex_test", "field_match_test", + "json_multi_field_match", "auto_json_diff", "auto_ai_critique", "auto_custom_code_run", diff --git a/web/oss/src/services/evaluators/index.ts b/web/oss/src/services/evaluators/index.ts index 8654f1e59f..e0f648fad8 100644 --- a/web/oss/src/services/evaluators/index.ts +++ b/web/oss/src/services/evaluators/index.ts @@ -67,6 +67,7 @@ const evaluatorIconsMap = { auto_similarity_match: similarityImg, auto_regex_test: regexImg, field_match_test: exactMatchImg, + json_multi_field_match: bracketCurlyImg, auto_webhook_test: webhookImg, auto_ai_critique: aiImg, auto_custom_code_run: codeImg, From 64f5c1435d7465afe9f9a2d5f09ea559c9bfa4ac Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:54:17 +0100 Subject: [PATCH 05/11] refactor(api): update json_multi_field_match evaluator to use aggregate_score instead of score and enhance field comparison descriptions --- api/oss/src/core/evaluators/service.py | 4 ++-- api/oss/src/resources/evaluators/evaluators.py | 6 +++--- sdk/agenta/sdk/workflows/handlers.py | 18 +++++++++--------- sdk/agenta/sdk/workflows/interfaces.py | 17 ++++++----------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index 8a2d38955d..076e82439a 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1451,7 +1451,7 @@ def _transfer_evaluator_revision_data( if str(old_evaluator.evaluator_key) == "json_multi_field_match": # Build dynamic properties based on configured fields fields = old_evaluator.settings_values.get("fields", []) - properties = {"score": {"type": "number"}} + properties = {"aggregate_score": {"type": "number"}} for field in fields: # Each field becomes a numeric score (0 or 1) properties[field] = {"type": "number"} @@ -1459,7 +1459,7 @@ def _transfer_evaluator_revision_data( "$schema": "https://json-schema.org/draft/2020-12/schema", "type": "object", "properties": properties, - "required": ["score"], + "required": ["aggregate_score"], "additionalProperties": False, } if not outputs_schema: diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py index c6b2b189d9..d4f4965f7a 100644 --- a/api/oss/src/resources/evaluators/evaluators.py +++ b/api/oss/src/resources/evaluators/evaluators.py @@ -363,9 +363,9 @@ "settings_template": { "fields": { "label": "Fields to Compare", - "type": "fields_checkbox_list", # Custom type - checkbox list with 
auto-detection from testcase + "type": "fields_tags_editor", # Custom type - tag-based add/remove editor "required": True, - "description": "Select which JSON fields to compare (auto-detected from testcase)", + "description": "Add fields to compare using dot notation for nested paths (e.g., user.name)", }, "correct_answer_key": { "label": "Expected Answer Column", @@ -377,7 +377,7 @@ "advanced": True, # Hidden in advanced section }, }, - "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate score column (0 or 1), with an overall score showing the match ratio. Useful for entity extraction validation.", + "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate metric (0 or 1), with an aggregate_score showing the percentage of matching fields. Useful for entity extraction validation.", "requires_testcase": "always", "requires_trace": "always", "oss": True, diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index c628fa21a1..74831ddb2b 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -578,8 +578,8 @@ def json_multi_field_match_v0( """ Multi-field JSON match evaluator for comparing multiple fields between expected and actual JSON. - Each configured field becomes a separate score (0 or 1), and an overall score shows - the ratio of matching fields. Useful for entity extraction validation. + Each configured field becomes a separate score (0 or 1), and an aggregate_score shows + the percentage of matching fields. Useful for entity extraction validation. Args: inputs: Testcase data with ground truth JSON @@ -589,8 +589,8 @@ def json_multi_field_match_v0( - correct_answer_key: Key in inputs containing the expected JSON Returns: - Dict with per-field scores and overall score, e.g.: - {"name": 1.0, "email": 0.0, "score": 0.5} + Dict with per-field scores and aggregate_score, e.g.: + {"name": 1.0, "email": 0.0, "aggregate_score": 0.5} """ if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) @@ -643,7 +643,7 @@ def json_multi_field_match_v0( if not isinstance(outputs, str) and not isinstance(outputs, dict): # Return all zeros if output is invalid results: Dict[str, Any] = {field: 0.0 for field in fields} - results["score"] = 0.0 + results["aggregate_score"] = 0.0 return results if isinstance(outputs, str): @@ -652,7 +652,7 @@ def json_multi_field_match_v0( except json.JSONDecodeError: # Return all zeros if output is not valid JSON results = {field: 0.0 for field in fields} - results["score"] = 0.0 + results["aggregate_score"] = 0.0 return results else: actual = outputs @@ -660,7 +660,7 @@ def json_multi_field_match_v0( if not isinstance(actual, dict): # Return all zeros if parsed output is not a dict results = {field: 0.0 for field in fields} - results["score"] = 0.0 + results["aggregate_score"] = 0.0 return results # -------------------------------------------------------------------------- @@ -679,8 +679,8 @@ def json_multi_field_match_v0( if match: matches += 1 - # Overall score is ratio of matching fields - results["score"] = matches / len(fields) if fields else 0.0 + # Aggregate score is the percentage of matching fields + results["aggregate_score"] = matches / len(fields) if fields else 0.0 # -------------------------------------------------------------------------- return results diff --git 
a/sdk/agenta/sdk/workflows/interfaces.py b/sdk/agenta/sdk/workflows/interfaces.py index d9a8425d80..6c1e5edfbf 100644 --- a/sdk/agenta/sdk/workflows/interfaces.py +++ b/sdk/agenta/sdk/workflows/interfaces.py @@ -202,21 +202,16 @@ outputs={ "type": "object", "title": "JSON Multi-Field Match Outputs", - "description": "Per-field match scores and overall match ratio. Each field produces a score_ output (0 or 1).", + "description": "Per-field match scores and aggregate score. Each field produces a 0 or 1 output.", "properties": { - "score": { + "aggregate_score": { "type": "number", - "title": "Overall Score", - "description": "Ratio of matched fields (0-1).", - }, - "success": { - "type": "boolean", - "title": "Success", - "description": "True if all selected fields matched.", + "title": "Aggregate Score", + "description": "Percentage of matched fields (0-1).", }, }, - "required": ["score", "success"], - "additionalProperties": True, # Allows dynamic score_ outputs + "required": ["aggregate_score"], + "additionalProperties": True, # Allows dynamic field outputs }, ), ) From 4a3065eb74b187c488c2e0e62a70c7383ced82fa Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:54:44 +0100 Subject: [PATCH 06/11] feat(frontend): add FieldsTagsEditor component for managing JSON field paths in evaluations --- .../ConfigureEvaluator/DynamicFormField.tsx | 3 + .../ConfigureEvaluator/FieldsTagsEditor.tsx | 283 ++++++++++++++++++ web/oss/src/lib/Types.ts | 1 + web/oss/src/lib/helpers/extractJsonPaths.ts | 86 ++++++ 4 files changed, 373 insertions(+) create mode 100644 web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx create mode 100644 web/oss/src/lib/helpers/extractJsonPaths.ts diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx index 3098026f1b..85f09fd173 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx @@ -11,6 +11,7 @@ import {isValidRegex} from "@/oss/lib/helpers/validators" import {generatePaths} from "@/oss/lib/transformers" import {EvaluationSettingsTemplate, JSSTheme} from "@/oss/lib/Types" +import {FieldsTagsEditor} from "./FieldsTagsEditor" import {JSONSchemaEditor} from "./JSONSchema" import {Messages} from "./Messages" @@ -215,6 +216,8 @@ export const DynamicFormField: React.FC = ({ : JSON.stringify(savedValue ?? {}, null, 2) } /> + ) : type === "fields_tags_editor" ? ( + ) : null} )} diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx new file mode 100644 index 0000000000..712bedd111 --- /dev/null +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -0,0 +1,283 @@ +/** + * FieldsTagsEditor - Tag-based editor for JSON field paths + * + * This component provides an add/remove interface for managing JSON field paths. 
+ * Users can: + * - Add fields manually using an input field (supports dot notation for nested paths) + * - Remove fields by clicking the X button on tags + * - Detect fields from the selected testcase using a dedicated button + * + * The component also displays a non-removable "overall" field representing + * the aggregate result across all fields. + * + * Auto-detection behavior: + * - When a testcase is loaded and no fields are configured, fields are auto-detected + */ + +import {useCallback, useEffect, useMemo, useRef, useState} from "react" + +import {CloseOutlined, PlusOutlined, SearchOutlined} from "@ant-design/icons" +import {Button, Input, Space, Tag, theme, Tooltip, Typography} from "antd" +import type {FormInstance} from "antd/es/form" +import {useAtomValue} from "jotai" +import {createUseStyles} from "react-jss" + +import {extractJsonPaths, safeParseJson} from "@/oss/lib/helpers/extractJsonPaths" +import type {JSSTheme} from "@/oss/lib/Types" + +import {playgroundSelectedTestcaseAtom} from "./state/atoms" + +const {Text} = Typography + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + container: { + display: "flex", + flexDirection: "column", + gap: 12, + }, + tagsContainer: { + display: "flex", + flexWrap: "wrap", + gap: 8, + padding: 12, + borderRadius: 6, + border: `1px solid ${theme.colorBorder}`, + backgroundColor: theme.colorBgContainer, + minHeight: 48, + }, + fieldTag: { + display: "flex", + alignItems: "center", + fontFamily: "monospace", + fontSize: 13, + margin: 0, + }, + matchRatioTag: { + fontFamily: "monospace", + fontSize: 13, + margin: 0, + fontWeight: 500, + }, + addFieldRow: { + display: "flex", + gap: 8, + }, + addInput: { + flex: 1, + fontFamily: "monospace", + }, + actionsRow: { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + }, + helperText: { + fontSize: 12, + color: theme.colorTextSecondary, + }, + emptyMessage: { + color: theme.colorTextSecondary, + fontSize: 13, + }, +})) + +interface FieldsTagsEditorProps { + value?: string[] + onChange?: (value: string[]) => void + form?: FormInstance + name?: string | string[] + correctAnswerKey?: string +} + +/** + * Tag-based editor for managing JSON field paths with add/remove functionality. + * Includes "Detect from testcase" feature to auto-populate fields. 
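+ *
+ * Usage sketch (illustrative; antd Form.Item injects value/onChange):
+ *     <Form.Item name={["settings_values", "fields"]}>
+ *         <FieldsTagsEditor form={form} correctAnswerKey="correct_answer" />
+ *     </Form.Item>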
+ */
+export const FieldsTagsEditor: React.FC<FieldsTagsEditorProps> = ({
+    value = [],
+    onChange,
+    form,
+    correctAnswerKey = "correct_answer",
+}) => {
+    const classes = useStyles()
+    const {token} = theme.useToken()
+    const [inputValue, setInputValue] = useState("")
+    // Track if we've already auto-detected to avoid re-triggering
+    const hasAutoDetectedRef = useRef(false)
+
+    // Read the selected testcase from the playground atom
+    const testcaseSelection = useAtomValue(playgroundSelectedTestcaseAtom)
+    const testcase = testcaseSelection?.testcase
+
+    // Get the correct_answer_key from form if available
+    const formCorrectAnswerKey = form?.getFieldValue(["settings_values", "correct_answer_key"])
+    const effectiveKey = formCorrectAnswerKey || correctAnswerKey
+
+    // Check if we can detect fields from testcase
+    const canDetectFields = useMemo(() => {
+        if (!testcase) return false
+        const groundTruthValue = testcase[effectiveKey]
+        if (!groundTruthValue) return false
+        const parsed = safeParseJson(groundTruthValue)
+        return parsed !== null
+    }, [testcase, effectiveKey])
+
+    // Extract available fields from the testcase
+    const detectableFields = useMemo(() => {
+        if (!testcase) return []
+        const groundTruthValue = testcase[effectiveKey]
+        if (!groundTruthValue) return []
+        const parsed = safeParseJson(groundTruthValue)
+        if (!parsed) return []
+        return extractJsonPaths(parsed)
+    }, [testcase, effectiveKey])
+
+    // Auto-detect fields when testcase is loaded and no fields are configured
+    useEffect(() => {
+        // Only auto-detect if:
+        // 1. We haven't already auto-detected
+        // 2. There are no user-defined fields
+        // 3. We can detect fields from the testcase
+        if (!hasAutoDetectedRef.current && value.length === 0 && detectableFields.length > 0) {
+            hasAutoDetectedRef.current = true
+            onChange?.(detectableFields)
+        }
+    }, [detectableFields, value.length, onChange])
+
+    // Handle adding a new field
+    const handleAddField = useCallback(() => {
+        const trimmed = inputValue.trim()
+        if (!trimmed) return
+
+        // Don't add duplicates
+        if (value.includes(trimmed)) {
+            setInputValue("")
+            return
+        }
+
+        // Don't allow reserved field names
+        if (trimmed === "aggregate_score") {
+            setInputValue("")
+            return
+        }
+
+        onChange?.([...value, trimmed])
+        setInputValue("")
+    }, [inputValue, value, onChange])
+
+    // Handle removing a field
+    const handleRemoveField = useCallback(
+        (fieldToRemove: string) => {
+            onChange?.(value.filter((f) => f !== fieldToRemove))
+        },
+        [value, onChange],
+    )
+
+    // Handle detecting fields from testcase (replaces existing fields)
+    const handleDetectFields = useCallback(() => {
+        if (detectableFields.length > 0) {
+            onChange?.(detectableFields)
+        }
+    }, [detectableFields, onChange])
+
+    // Handle Enter key in input
+    const handleInputKeyDown = useCallback(
+        (e: React.KeyboardEvent) => {
+            if (e.key === "Enter") {
+                e.preventDefault()
+                handleAddField()
+            }
+        },
+        [handleAddField],
+    )
+
+    // Generate tooltip for disabled detect button
+    const detectButtonTooltip = useMemo(() => {
+        if (!testcase) {
+            return "Select a testcase first to detect fields"
+        }
+        if (!canDetectFields) {
+            return `No JSON object found in the "${effectiveKey}" column`
+        }
+        return `Detect ${detectableFields.length} field(s) from testcase (replaces current fields)`
+    }, [testcase, canDetectFields, effectiveKey, detectableFields.length])
+
+    return (
+        <div className={classes.container}>
+            {/* Field Tags Display */}
+            <div className={classes.tagsContainer}>
+                {/* Non-removable aggregate_score tag */}
+                <Tooltip title="Aggregate match ratio across all configured fields">
+                    <Tag className={classes.matchRatioTag}>
+                        aggregate_score
+                    </Tag>
+                </Tooltip>
+
+                {/* User-defined field tags */}
+                {value.map((field) => (
+                    <Tag
+                        key={field}
+                        closable
+                        onClose={() => handleRemoveField(field)}
+                        className={classes.fieldTag}
+                    >
+                        {field}
+                    </Tag>
+                ))}
+
+                {/* Empty state message */}
+                {value.length === 0 && (
+                    <Text className={classes.emptyMessage}>
+                        Add fields to compare or detect them from a testcase
+                    </Text>
+                )}
+            </div>
+
+            {/* Add Field Input */}
+            <div className={classes.addFieldRow}>
+                <Input
+                    className={classes.addInput}
+                    value={inputValue}
+                    onChange={(e) => setInputValue(e.target.value)}
+                    onKeyDown={handleInputKeyDown}
+                    suffix={
+                        <Tooltip title="Use dot notation for nested paths, e.g. user.address.city">
+                            <Text type="secondary">?</Text>
+                        </Tooltip>
+                    }
+                />
+                <Button icon={<PlusOutlined />} onClick={handleAddField} />
+            </div>
+
+            {/* Actions Row */}
+            <div className={classes.actionsRow}>
+                <Text className={classes.helperText}>
+                    Each field creates a column with value 0 (no match) or 1 (match)
+                </Text>
+
+                <Tooltip title={detectButtonTooltip}>
+                    <Button
+                        icon={<SearchOutlined />}
+                        onClick={handleDetectFields}
+                        disabled={!canDetectFields}
+                    />
+                </Tooltip>
+            </div>
+        </div>
+    )
+}
+
+export default FieldsTagsEditor
diff --git a/web/oss/src/lib/Types.ts b/web/oss/src/lib/Types.ts
index c33b0ea498..4ca2ae5555 100644
--- a/web/oss/src/lib/Types.ts
+++ b/web/oss/src/lib/Types.ts
@@ -990,6 +990,7 @@ type ValueTypeOptions =
     | "messages"
     | "multiple_choice"
     | "llm_response_schema"
+    | "fields_tags_editor"
 
 export interface EvaluationSettingsTemplate {
     type: ValueTypeOptions
diff --git a/web/oss/src/lib/helpers/extractJsonPaths.ts b/web/oss/src/lib/helpers/extractJsonPaths.ts
new file mode 100644
index 0000000000..62176319e5
--- /dev/null
+++ b/web/oss/src/lib/helpers/extractJsonPaths.ts
+/**
+ * Utility functions for extracting JSON paths from objects.
+ * Used by the JSON Multi-Field Match evaluator to auto-detect fields from testcase data.
+ */
+
+/**
+ * Recursively extracts all leaf paths from a JSON object using dot notation.
+ *
+ * Example:
+ *   Input: {user: {name: "John", address: {city: "NYC"}}}
+ *   Output: ["user.name", "user.address.city"]
+ *
+ * @param obj - The object to extract paths from
+ * @param prefix - Current path prefix (used for recursion)
+ * @returns Array of dot-notation paths to all leaf values
+ */
+export const extractJsonPaths = (obj: unknown, prefix = ""): string[] => {
+    if (obj === null || obj === undefined) return []
+    if (typeof obj !== "object") return prefix ? [prefix] : []
+
+    // For arrays, we don't expand individual indices - just mark the path
+    // This keeps the UI manageable and matches common use cases
+    if (Array.isArray(obj)) {
+        return prefix ? [prefix] : []
+    }
+
+    const paths: string[] = []
+
+    for (const key of Object.keys(obj as Record<string, unknown>)) {
+        const newPrefix = prefix ? `${prefix}.${key}` : key
+        const value = (obj as Record<string, unknown>)[key]
+
+        if (value !== null && typeof value === "object" && !Array.isArray(value)) {
+            // Recurse into nested objects
+            paths.push(...extractJsonPaths(value, newPrefix))
+        } else {
+            // Leaf node (primitive, array, or null)
+            paths.push(newPrefix)
+        }
+    }
+
+    return paths
+}
+
+/**
+ * Parses a JSON string and extracts all paths.
+ * Returns empty array if parsing fails.
+ *
+ * @param jsonString - JSON string to parse and extract paths from
+ * @returns Array of dot-notation paths
+ */
+export const extractJsonPathsFromString = (jsonString: string): string[] => {
+    try {
+        const parsed = JSON.parse(jsonString)
+        return extractJsonPaths(parsed)
+    } catch {
+        return []
+    }
+}
+
+/**
+ * Safely parses a value that might be JSON string or already an object.
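+ *
+ * Example (illustrative):
+ *   safeParseJson('{"a": 1}')  // -> {a: 1}
+ *   safeParseJson([1, 2])      // -> null (arrays are rejected)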
+ * + * @param value - Value to parse (string or object) + * @returns Parsed object or null if invalid + */ +export const safeParseJson = (value: unknown): Record | null => { + if (value === null || value === undefined) return null + + if (typeof value === "object" && !Array.isArray(value)) { + return value as Record + } + + if (typeof value === "string") { + try { + const parsed = JSON.parse(value) + if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) { + return parsed + } + } catch { + return null + } + } + + return null +} From d47fc55fc84e2d51c3f3d82490a2e1693661dd26 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 21:03:12 +0100 Subject: [PATCH 07/11] refactor(frontend): simplify FieldsTagsEditor component by removing unused styles and optimizing class names --- .../ConfigureEvaluator/FieldsTagsEditor.tsx | 80 +++---------------- 1 file changed, 12 insertions(+), 68 deletions(-) diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx index 712bedd111..c446c95d4b 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -16,71 +16,17 @@ import {useCallback, useEffect, useMemo, useRef, useState} from "react" -import {CloseOutlined, PlusOutlined, SearchOutlined} from "@ant-design/icons" -import {Button, Input, Space, Tag, theme, Tooltip, Typography} from "antd" +import {PlusOutlined, SearchOutlined} from "@ant-design/icons" +import {Button, Input, Tag, Tooltip, Typography} from "antd" import type {FormInstance} from "antd/es/form" import {useAtomValue} from "jotai" -import {createUseStyles} from "react-jss" import {extractJsonPaths, safeParseJson} from "@/oss/lib/helpers/extractJsonPaths" -import type {JSSTheme} from "@/oss/lib/Types" import {playgroundSelectedTestcaseAtom} from "./state/atoms" const {Text} = Typography -const useStyles = createUseStyles((theme: JSSTheme) => ({ - container: { - display: "flex", - flexDirection: "column", - gap: 12, - }, - tagsContainer: { - display: "flex", - flexWrap: "wrap", - gap: 8, - padding: 12, - borderRadius: 6, - border: `1px solid ${theme.colorBorder}`, - backgroundColor: theme.colorBgContainer, - minHeight: 48, - }, - fieldTag: { - display: "flex", - alignItems: "center", - fontFamily: "monospace", - fontSize: 13, - margin: 0, - }, - matchRatioTag: { - fontFamily: "monospace", - fontSize: 13, - margin: 0, - fontWeight: 500, - }, - addFieldRow: { - display: "flex", - gap: 8, - }, - addInput: { - flex: 1, - fontFamily: "monospace", - }, - actionsRow: { - display: "flex", - alignItems: "center", - justifyContent: "space-between", - }, - helperText: { - fontSize: 12, - color: theme.colorTextSecondary, - }, - emptyMessage: { - color: theme.colorTextSecondary, - fontSize: 13, - }, -})) - interface FieldsTagsEditorProps { value?: string[] onChange?: (value: string[]) => void @@ -99,8 +45,6 @@ export const FieldsTagsEditor: React.FC = ({ form, correctAnswerKey = "correct_answer", }) => { - const classes = useStyles() - const {token} = theme.useToken() const [inputValue, setInputValue] = useState("") // Track if we've already auto-detected to avoid re-triggering const hasAutoDetectedRef = useRef(false) @@ -203,12 +147,12 @@ export const 
FieldsTagsEditor: React.FC = ({ }, [testcase, canDetectFields, effectiveKey, detectableFields.length]) return ( -
+
{/* Field Tags Display */} -
+
{/* Non-removable aggregate_score tag */} - + aggregate_score @@ -219,7 +163,7 @@ export const FieldsTagsEditor: React.FC = ({ key={field} closable onClose={() => handleRemoveField(field)} - className={classes.fieldTag} + className="flex items-center font-mono text-[13px] !m-0" > {field} @@ -227,23 +171,23 @@ export const FieldsTagsEditor: React.FC = ({ {/* Empty state message */} {value.length === 0 && ( - + Add fields to compare or detect them from a testcase )}
{/* Add Field Input */} -
+
setInputValue(e.target.value)} onKeyDown={handleInputKeyDown} suffix={ - + ? @@ -259,8 +203,8 @@ export const FieldsTagsEditor: React.FC = ({
{/* Actions Row */} -
- +
+ Each field creates a column with value 0 (no match) or 1 (match) From 0df3ddc1c744563be308d678c2a6213f65f52e8c Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 21:03:23 +0100 Subject: [PATCH 08/11] refactor(api): clean up unused imports and improve parameter checks in workflow handlers --- api/oss/src/core/evaluators/service.py | 10 ---- sdk/agenta/sdk/workflows/handlers.py | 71 ++++++++++++-------------- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index 076e82439a..5a2b97250c 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1,13 +1,9 @@ from typing import Optional, List from uuid import UUID, uuid4 -from json import loads from oss.src.utils.helpers import get_slug_from_name_and_id from oss.src.services.db_manager import fetch_evaluator_config from oss.src.core.workflows.dtos import ( - WorkflowFlags, - WorkflowQueryFlags, - # WorkflowCreate, WorkflowEdit, WorkflowQuery, @@ -17,8 +13,6 @@ WorkflowVariantEdit, WorkflowVariantQuery, # - WorkflowRevisionData, - # WorkflowRevisionCreate, WorkflowRevisionEdit, WorkflowRevisionCommit, @@ -35,11 +29,7 @@ SimpleEvaluatorEdit, SimpleEvaluatorQuery, SimpleEvaluatorFlags, - SimpleEvaluatorQueryFlags, - # EvaluatorFlags, - EvaluatorQueryFlags, - # Evaluator, EvaluatorQuery, EvaluatorRevisionsLog, diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index 74831ddb2b..dc2e7cd20e 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -1,4 +1,4 @@ -from typing import List, Any, Optional, Any, Dict, Union +from typing import List, Optional, Any, Dict, Union from json import dumps, loads import traceback import json @@ -22,7 +22,6 @@ from agenta.sdk.decorators.tracing import instrument from agenta.sdk.models.shared import Data -from agenta.sdk.models.tracing import Trace from agenta.sdk.workflows.sandbox import execute_code_safely from agenta.sdk.workflows.errors import ( InvalidConfigurationParametersV0Error, @@ -32,7 +31,6 @@ MissingInputV0Error, InvalidInputV0Error, InvalidOutputsV0Error, - MissingOutputV0Error, InvalidSecretsV0Error, JSONDiffV0Error, LevenshteinDistanceV0Error, @@ -46,7 +44,6 @@ PromptCompletionV0Error, ) -from agenta.sdk.litellm import mockllm from agenta.sdk.litellm.litellm import litellm_handler litellm.logging = False @@ -76,9 +73,7 @@ def _compute_similarity(embedding_1: List[float], embedding_2: List[float]) -> f return dot / (norm1 * norm2) -import json -import re -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Iterable, Tuple try: import jsonpath # ✅ use module API @@ -389,7 +384,7 @@ def auto_exact_match_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -397,7 +392,7 @@ def auto_exact_match_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -434,7 +429,7 @@ def auto_regex_test_v0( if parameters is None or not 
isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "regex_pattern" in parameters: + if "regex_pattern" not in parameters: raise MissingConfigurationParameterV0Error(path="regex_pattern") regex_pattern = parameters["regex_pattern"] @@ -492,12 +487,12 @@ def field_match_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "json_field" in parameters: + if "json_field" not in parameters: raise MissingConfigurationParameterV0Error(path="json_field") json_field = str(parameters["json_field"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -505,7 +500,7 @@ def field_match_test_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -518,7 +513,7 @@ def field_match_test_v0( if isinstance(outputs, str): try: outputs_dict = loads(outputs) - except json.JSONDecodeError as e: + except json.JSONDecodeError: # raise InvalidOutputsV0Error(expected="dict", got=outputs) from e return {"success": False} @@ -526,7 +521,7 @@ def field_match_test_v0( # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs) return {"success": False} - if not json_field in outputs_dict: + if json_field not in outputs_dict: # raise MissingOutputV0Error(path=json_field) return {"success": False} @@ -624,7 +619,7 @@ def json_multi_field_match_v0( if isinstance(correct_answer, str): try: expected = json.loads(correct_answer) - except json.JSONDecodeError as e: + except json.JSONDecodeError: raise InvalidInputV0Error( path=correct_answer_key, expected="valid JSON string", @@ -706,12 +701,12 @@ async def auto_webhook_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "webhook_url" in parameters: + if "webhook_url" not in parameters: raise MissingConfigurationParameterV0Error(path="webhook_url") webhook_url = str(parameters["webhook_url"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -719,7 +714,7 @@ async def auto_webhook_test_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -811,12 +806,12 @@ async def auto_custom_code_run_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "code" in parameters: + if "code" not in parameters: raise MissingConfigurationParameterV0Error(path="code") code = str(parameters["code"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -824,7 +819,7 @@ async def 
auto_custom_code_run_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -902,7 +897,7 @@ async def auto_ai_critique_v0( correct_answer_key = parameters.get("correct_answer_key") - if not "prompt_template" in parameters: + if "prompt_template" not in parameters: raise MissingConfigurationParameterV0Error(path="prompt_template") prompt_template = parameters.get("prompt_template") @@ -933,7 +928,7 @@ async def auto_ai_critique_v0( "json_schema" if template_version == "4" else "text" ) - if not response_type in ["text", "json_object", "json_schema"]: + if response_type not in ["text", "json_object", "json_schema"]: raise InvalidConfigurationParameterV0Error( path="response_type", expected=["text", "json_object", "json_schema"], @@ -1135,7 +1130,7 @@ def auto_starts_with_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "prefix" in parameters: + if "prefix" not in parameters: raise MissingConfigurationParameterV0Error(path="prefix") prefix = parameters["prefix"] @@ -1184,7 +1179,7 @@ def auto_ends_with_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "suffix" in parameters: + if "suffix" not in parameters: raise MissingConfigurationParameterV0Error(path="suffix") suffix = parameters["suffix"] @@ -1233,7 +1228,7 @@ def auto_contains_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substring" in parameters: + if "substring" not in parameters: raise MissingConfigurationParameterV0Error(path="substring") substring = parameters["substring"] @@ -1282,7 +1277,7 @@ def auto_contains_any_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substrings" in parameters: + if "substrings" not in parameters: raise MissingConfigurationParameterV0Error(path="substrings") substrings = parameters["substrings"] @@ -1340,7 +1335,7 @@ def auto_contains_all_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substrings" in parameters: + if "substrings" not in parameters: raise MissingConfigurationParameterV0Error(path="substrings") substrings = parameters["substrings"] @@ -1440,7 +1435,7 @@ def auto_json_diff_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1448,7 +1443,7 @@ def auto_json_diff_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1532,7 +1527,7 @@ def auto_levenshtein_distance_v0( if parameters is None or not isinstance(parameters, dict): raise 
InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1542,7 +1537,7 @@ def auto_levenshtein_distance_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1637,7 +1632,7 @@ def auto_similarity_match_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1647,7 +1642,7 @@ def auto_similarity_match_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1730,7 +1725,7 @@ async def auto_semantic_similarity_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1743,7 +1738,7 @@ async def auto_semantic_similarity_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1845,7 +1840,7 @@ async def completion_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "prompt" in parameters: + if "prompt" not in parameters: raise MissingConfigurationParameterV0Error(path="prompt") params: Dict[str, Any] = {**(parameters or {})} From 04c087ca84752c43368d84a0c8067fd8dd617226 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 21:04:43 +0100 Subject: [PATCH 09/11] fix(api): use aggregate_score instead of score in json_multi_field_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes naming inconsistency between API service and SDK interface schema. The interface defines `aggregate_score` as the required output field, so the service must use the same name to pass schema validation. Also applies ruff auto-cleanup for unused imports. 
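
Illustrative result shape after the rename, for fields ["name", "email"] with
one field matching: {"name": 1.0, "email": 0.0, "aggregate_score": 0.5}.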
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- api/oss/src/services/evaluators_service.py | 32 ++++++++++------------ 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index 25505e4029..74902b0a8d 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -253,7 +253,7 @@ async def auto_exact_match( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -487,8 +487,8 @@ async def json_multi_field_match( if match: matches += 1 - # Overall score is average of field scores - results["score"] = matches / len(fields) if fields else 0.0 + # Aggregate score is the percentage of matching fields + results["aggregate_score"] = matches / len(fields) if fields else 0.0 return {"outputs": results} @@ -576,7 +576,7 @@ async def auto_custom_code_run( ) ) return Result(type="number", value=response["outputs"]["score"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -645,7 +645,7 @@ async def auto_ai_critique( ) ) return Result(type="number", value=response["outputs"]["score"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -656,9 +656,7 @@ async def auto_ai_critique( ) -import json -import re -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Dict, Iterable, Tuple try: import jsonpath # ✅ use module API @@ -1295,7 +1293,7 @@ async def auto_starts_with( ) ) return Result(type="bool", value=response["outputs"]["success"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1337,7 +1335,7 @@ async def auto_ends_with( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1379,7 +1377,7 @@ async def auto_contains( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1421,7 +1419,7 @@ async def auto_contains_any( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1464,7 +1462,7 @@ async def auto_contains_all( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1512,7 +1510,7 @@ async def auto_contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) return Result(type="bool", value=response["outputs"]["success"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1530,7 +1528,7 @@ async def contains_json(input: 
EvaluatorInputInterface) -> EvaluatorOutputInterf potential_json = str(input.inputs["prediction"])[start_index:end_index] json.loads(potential_json) contains_json = True - except (ValueError, json.JSONDecodeError) as e: + except (ValueError, json.JSONDecodeError): contains_json = False return {"outputs": {"success": contains_json}} @@ -1992,7 +1990,7 @@ async def auto_levenshtein_distance( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -2032,7 +2030,7 @@ async def auto_similarity_match( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, From 325d571690414a8e9ebf19c612098ebaeddd28bd Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Wed, 31 Dec 2025 15:09:24 +0100 Subject: [PATCH 10/11] refactor(api): enhance get_nested_value function to support multiple path formats Updated the get_nested_value function to utilize resolve_any() for improved path resolution, allowing support for dot notation, JSON Path, and JSON Pointer formats. Cleaned up imports and ensured consistent error handling for path resolution failures. --- api/oss/src/services/evaluators_service.py | 59 +++++++--------- sdk/agenta/sdk/workflows/handlers.py | 78 +++++++++------------- 2 files changed, 56 insertions(+), 81 deletions(-) diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index 33a557dce9..04a0fbfc6b 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -1,34 +1,31 @@ -import re import json +import re import traceback -from typing import Any, Dict, Union, List, Optional +from typing import Any, Dict, List, Optional, Union -import litellm import httpx +import litellm import numpy as np -from openai import AsyncOpenAI +from agenta.sdk.managers.secrets import SecretsManager from fastapi import HTTPException from numpy._core._multiarray_umath import array -# from autoevals.ragas import Faithfulness, ContextRelevancy # Commented out due to autoevals removal - -from oss.src.utils.logging import get_module_logger -from oss.src.services.security import sandbox -from oss.src.models.shared_models import Error, Result +from openai import AsyncOpenAI from oss.src.models.api.evaluation_model import ( EvaluatorInputInterface, - EvaluatorOutputInterface, EvaluatorMappingInputInterface, EvaluatorMappingOutputInterface, + EvaluatorOutputInterface, ) +from oss.src.models.shared_models import Error, Result +from oss.src.services.security import sandbox + +# from autoevals.ragas import Faithfulness, ContextRelevancy # Commented out due to autoevals removal +from oss.src.utils.logging import get_module_logger from oss.src.utils.traces import ( - remove_trace_prefix, - process_distributed_trace_into_trace_tree, get_field_value_from_trace_tree, + process_distributed_trace_into_trace_tree, ) -from agenta.sdk.managers.secrets import SecretsManager - - log = get_module_logger(__name__) @@ -354,35 +351,27 @@ async def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInt def get_nested_value(obj: Any, path: str) -> Any: """ - Get value from nested dict/object using dot notation (e.g., 'user.address.city'). + Get value from nested object using resolve_any() with graceful None on failure. 
+ + Supports multiple path formats: + - Dot notation: "user.address.city", "items.0.name" + - JSON Path: "$.user.address.city", "$.items[0].name" + - JSON Pointer: "/user/address/city", "/items/0/name" Args: obj: The object to traverse (dict or nested structure) - path: Dot-notation path to the value (e.g., 'user.address.city') + path: Path expression in any supported format Returns: - The value at the specified path, or None if path doesn't exist + The value at the specified path, or None if path doesn't exist or resolution fails """ if obj is None: return None - keys = path.split(".") - value = obj - - for key in keys: - if isinstance(value, dict): - value = value.get(key) - elif isinstance(value, list) and key.isdigit(): - # Support array indexing with numeric keys - idx = int(key) - value = value[idx] if 0 <= idx < len(value) else None - else: - return None - - if value is None: - return None - - return value + try: + return resolve_any(path, obj) + except (KeyError, IndexError, ValueError, TypeError, ImportError): + return None async def auto_json_multi_field_match( diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index dc2e7cd20e..7fc8b0e670 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -1,50 +1,43 @@ -from typing import List, Optional, Any, Dict, Union -from json import dumps, loads -import traceback import json -import re import math +import re +import traceback +from difflib import SequenceMatcher +from json import dumps, loads +from typing import Any, Dict, List, Optional, Union import httpx - import litellm - -from pydantic import BaseModel, Field -from openai import AsyncOpenAI, OpenAIError -from difflib import SequenceMatcher - -from agenta.sdk.utils.logging import get_module_logger - +from agenta.sdk.decorators.tracing import instrument from agenta.sdk.litellm import mockllm -from agenta.sdk.types import PromptTemplate, Message +from agenta.sdk.litellm.litellm import litellm_handler from agenta.sdk.managers.secrets import SecretsManager - -from agenta.sdk.decorators.tracing import instrument - from agenta.sdk.models.shared import Data -from agenta.sdk.workflows.sandbox import execute_code_safely +from agenta.sdk.types import Message, PromptTemplate +from agenta.sdk.utils.logging import get_module_logger from agenta.sdk.workflows.errors import ( + CustomCodeServerV0Error, InvalidConfigurationParametersV0Error, - MissingConfigurationParameterV0Error, InvalidConfigurationParameterV0Error, InvalidInputsV0Error, - MissingInputV0Error, InvalidInputV0Error, InvalidOutputsV0Error, InvalidSecretsV0Error, JSONDiffV0Error, LevenshteinDistanceV0Error, - SyntacticSimilarityV0Error, + MissingConfigurationParameterV0Error, + MissingInputV0Error, + PromptCompletionV0Error, + PromptFormattingV0Error, + RegexPatternV0Error, SemanticSimilarityV0Error, - WebhookServerV0Error, + SyntacticSimilarityV0Error, WebhookClientV0Error, - CustomCodeServerV0Error, - RegexPatternV0Error, - PromptFormattingV0Error, - PromptCompletionV0Error, + WebhookServerV0Error, ) - -from agenta.sdk.litellm.litellm import litellm_handler +from agenta.sdk.workflows.sandbox import execute_code_safely +from openai import AsyncOpenAI, OpenAIError +from pydantic import BaseModel, Field litellm.logging = False litellm.set_verbose = False @@ -534,34 +527,27 @@ def field_match_test_v0( def _get_nested_value(obj: Any, path: str) -> Any: """ - Get value from nested dict using dot notation path. 
+    Get value from nested object using resolve_any() with graceful None on failure.
+
+    Supports multiple path formats:
+    - Dot notation: "user.address.city", "items.0.name"
+    - JSON Path: "$.user.address.city", "$.items[0].name"
+    - JSON Pointer: "/user/address/city", "/items/0/name"
 
     Args:
         obj: The object to traverse (dict or list)
-        path: Dot-separated path like "user.address.city" or "items.0.name"
+        path: Path expression in any supported format
 
     Returns:
-        The value at the path, or None if path doesn't exist
+        The value at the path, or None if path doesn't exist or resolution fails
     """
     if obj is None:
         return None
 
-    keys = path.split(".")
-    value = obj
-
-    for key in keys:
-        if isinstance(value, dict):
-            value = value.get(key)
-        elif isinstance(value, list) and key.isdigit():
-            idx = int(key)
-            value = value[idx] if 0 <= idx < len(value) else None
-        else:
-            return None
-
-        if value is None:
-            return None
-
-    return value
+    try:
+        return resolve_any(path, obj)
+    except (KeyError, IndexError, ValueError, TypeError, ImportError):
+        return None
 
 
 @instrument(annotate=True)

From cc255adb693fc5900038336b2365a09df53151e9 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk
Date: Mon, 5 Jan 2026 21:51:49 +0100
Subject: [PATCH 11/11] Refactor FieldsTagsEditor to use Form.useWatch for
 correct_answer_key reactivity

- Updated FieldsTagsEditor to use Form.useWatch instead of form.getFieldValue
  when reading correct_answer_key.
- form.getFieldValue returns a one-off snapshot taken at render time, so the
  component could keep a stale value; Form.useWatch subscribes to the field
  and re-renders the component whenever its value changes.

---
 .../ConfigureEvaluator/FieldsTagsEditor.tsx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
index c446c95d4b..a96a07a37f 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
@@ -17,7 +17,7 @@
 import {useCallback, useEffect, useMemo, useRef, useState} from "react"
 
 import {PlusOutlined, SearchOutlined} from "@ant-design/icons"
-import {Button, Input, Tag, Tooltip, Typography} from "antd"
+import {Button, Form, Input, Tag, Tooltip, Typography} from "antd"
 import type {FormInstance} from "antd/es/form"
 
 import {useAtomValue} from "jotai"
@@ -53,8 +53,9 @@ export const FieldsTagsEditor: React.FC = ({
     const testcaseSelection = useAtomValue(playgroundSelectedTestcaseAtom)
     const testcase = testcaseSelection?.testcase
 
-    // Get the correct_answer_key from form if available
-    const formCorrectAnswerKey = form?.getFieldValue(["settings_values", "correct_answer_key"])
+    // Watch the correct_answer_key from form to react to changes
+    // Using Form.useWatch instead of form.getFieldValue for reactivity
+    const formCorrectAnswerKey = Form.useWatch(["settings_values", "correct_answer_key"], form)
     const effectiveKey = formCorrectAnswerKey || correctAnswerKey
 
     // Check if we can detect fields from testcase
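
For illustration, a minimal standalone sketch of the behavioral difference
this patch relies on — the demo component and field path are assumptions for
the example, not code from FieldsTagsEditor. getFieldValue reads the form
store once per render without subscribing, while Form.useWatch subscribes and
triggers a re-render on every change:

    import {Form, Input, Typography} from "antd"

    // Hypothetical demo component, not part of the patch.
    const WatchDemo = () => {
        const [form] = Form.useForm()
        // Subscribes to the field: this component re-renders on each edit.
        const watched = Form.useWatch(["settings_values", "correct_answer_key"], form)
        // By contrast, form.getFieldValue(...) here would read the current
        // value once and never cause a re-render when the field changes.

        return (
            <Form form={form}>
                <Form.Item
                    name={["settings_values", "correct_answer_key"]}
                    label="Expected Answer Column"
                >
                    <Input />
                </Form.Item>
                {/* Stays in sync as the user types */}
                <Typography.Text>Current: {watched}</Typography.Text>
            </Form>
        )
    }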