From 2520a198136aa42991ff7870bd2258e204b45be1 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Mon, 22 Dec 2025 20:48:05 +0100 Subject: [PATCH 01/11] feat(api): add archived field to LegacyEvaluator model --- api/oss/src/models/api/evaluation_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/oss/src/models/api/evaluation_model.py b/api/oss/src/models/api/evaluation_model.py index a914ff7c7b..a17ca315ad 100644 --- a/api/oss/src/models/api/evaluation_model.py +++ b/api/oss/src/models/api/evaluation_model.py @@ -20,6 +20,7 @@ class LegacyEvaluator(BaseModel): oss: Optional[bool] = False requires_llm_api_keys: Optional[bool] = False tags: List[str] + archived: Optional[bool] = False class EvaluatorConfig(BaseModel): From fb13c912dafe7a2d74e418e8741510f88264bf93 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Mon, 22 Dec 2025 20:48:13 +0100 Subject: [PATCH 02/11] feat(api): introduce JSON Multi-Field Match evaluator and deprecate JSON Field Match --- .../src/resources/evaluators/evaluators.py | 28 ++++ api/oss/src/services/evaluators_service.py | 143 ++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py index 89d9974f8d..c6b2b189d9 100644 --- a/api/oss/src/resources/evaluators/evaluators.py +++ b/api/oss/src/resources/evaluators/evaluators.py @@ -332,6 +332,7 @@ "name": "JSON Field Match", "key": "field_match_test", "direct_use": False, + "archived": True, # Deprecated - use json_multi_field_match instead "settings_template": { "json_field": { "label": "JSON Field", @@ -355,6 +356,33 @@ "oss": True, "tags": ["classifiers"], }, + { + "name": "JSON Multi-Field Match", + "key": "json_multi_field_match", + "direct_use": False, + "settings_template": { + "fields": { + "label": "Fields to Compare", + "type": "fields_checkbox_list", # Custom type - checkbox list with auto-detection from testcase + "required": True, + "description": "Select which JSON fields to compare (auto-detected from testcase)", + }, + "correct_answer_key": { + "label": "Expected Answer Column", + "default": "correct_answer", + "type": "string", + "required": True, + "description": "Column name containing the expected JSON object", + "ground_truth_key": True, + "advanced": True, # Hidden in advanced section + }, + }, + "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate score column (0 or 1), with an overall score showing the match ratio. Useful for entity extraction validation.", + "requires_testcase": "always", + "requires_trace": "always", + "oss": True, + "tags": ["classifiers"], + }, { "name": "JSON Diff Match", "key": "auto_json_diff", diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index b866db4b16..25505e4029 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -352,6 +352,147 @@ async def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInt return {"outputs": {"success": result}} +def get_nested_value(obj: Any, path: str) -> Any: + """ + Get value from nested dict/object using dot notation (e.g., 'user.address.city'). 
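+
+    Example (illustrative):
+        get_nested_value({"user": {"address": {"city": "NYC"}}}, "user.address.city")  # -> "NYC"
+        get_nested_value({"items": [{"name": "a"}]}, "items.0.name")  # -> "a" (numeric list index)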
+ + Args: + obj: The object to traverse (dict or nested structure) + path: Dot-notation path to the value (e.g., 'user.address.city') + + Returns: + The value at the specified path, or None if path doesn't exist + """ + if obj is None: + return None + + keys = path.split(".") + value = obj + + for key in keys: + if isinstance(value, dict): + value = value.get(key) + elif isinstance(value, list) and key.isdigit(): + # Support array indexing with numeric keys + idx = int(key) + value = value[idx] if 0 <= idx < len(value) else None + else: + return None + + if value is None: + return None + + return value + + +async def auto_json_multi_field_match( + inputs: Dict[str, Any], # pylint: disable=unused-argument + output: Union[str, Dict[str, Any]], + data_point: Dict[str, Any], + app_params: Dict[str, Any], # pylint: disable=unused-argument + settings_values: Dict[str, Any], + lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument +) -> Result: + """ + Evaluator that compares multiple configured fields in expected JSON against LLM output JSON. + Each configured field becomes a separate score in the output. + + Returns a Result with: + - type="object" containing one score per configured field plus overall score + - Each field score is 1.0 (match) or 0.0 (no match) + - Overall 'score' is the average of all field scores + """ + try: + output = validate_string_output("json_multi_field_match", output) + correct_answer = get_correct_answer(data_point, settings_values) + eval_inputs = {"ground_truth": correct_answer, "prediction": output} + response = await json_multi_field_match( + input=EvaluatorInputInterface( + **{"inputs": eval_inputs, "settings": settings_values} + ) + ) + return Result(type="object", value=response["outputs"]) + except ValueError as e: + return Result( + type="error", + value=None, + error=Error( + message=str(e), + ), + ) + except Exception: + return Result( + type="error", + value=None, + error=Error( + message="Error during JSON Multi-Field Match evaluation", + stacktrace=str(traceback.format_exc()), + ), + ) + + +async def json_multi_field_match( + input: EvaluatorInputInterface, +) -> EvaluatorOutputInterface: + """ + Compare configured fields in expected JSON against LLM output JSON. + Each configured field becomes a separate score in the output. 
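+
+    Example (illustrative): with settings {"fields": ["name", "email"]},
+    ground_truth='{"name": "Ada", "email": "a@b.c"}' and
+    prediction='{"name": "Ada", "email": "x@y.z"}', the result is
+    {"outputs": {"name": 1.0, "email": 0.0, "score": 0.5}}.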
+ + Args: + input: EvaluatorInputInterface with: + - inputs.prediction: JSON string from LLM output + - inputs.ground_truth: JSON string from test data column + - settings.fields: List of field paths (strings) e.g., ["name", "email", "user.address.city"] + + Returns: + EvaluatorOutputInterface with one score per configured field plus overall score + """ + fields = input.settings.get("fields", []) + + if not fields: + raise ValueError("No fields configured for comparison") + + # Parse both JSON objects + prediction = input.inputs.get("prediction", "") + ground_truth = input.inputs.get("ground_truth", "") + + try: + if isinstance(ground_truth, str): + expected = json.loads(ground_truth) + else: + expected = ground_truth + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in ground truth: {str(e)}") + + try: + if isinstance(prediction, str): + actual = json.loads(prediction) + else: + actual = prediction + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in prediction: {str(e)}") + + results: Dict[str, Any] = {} + matches = 0 + + for field_path in fields: + # Support nested fields with dot notation + expected_val = get_nested_value(expected, field_path) + actual_val = get_nested_value(actual, field_path) + + # Exact match comparison (v1 - always exact) + match = expected_val == actual_val + + results[field_path] = 1.0 if match else 0.0 + if match: + matches += 1 + + # Overall score is average of field scores + results["score"] = matches / len(fields) if fields else 0.0 + + return {"outputs": results} + + async def auto_webhook_test( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], @@ -1987,6 +2128,7 @@ async def auto_semantic_similarity( "auto_exact_match": auto_exact_match, "auto_regex_test": auto_regex_test, "field_match_test": auto_field_match_test, + "json_multi_field_match": auto_json_multi_field_match, "auto_webhook_test": auto_webhook_test, "auto_custom_code_run": auto_custom_code_run, "auto_ai_critique": auto_ai_critique, @@ -2008,6 +2150,7 @@ async def auto_semantic_similarity( "auto_exact_match": exact_match, "auto_regex_test": regex_test, "field_match_test": field_match_test, + "json_multi_field_match": json_multi_field_match, "auto_webhook_test": webhook_test, "auto_custom_code_run": custom_code_run, "auto_ai_critique": ai_critique, From 70f1cb66f54e52d0aadaf2798fb315b3cf15b656 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:20:08 +0100 Subject: [PATCH 03/11] feat(api): implement JSON Multi-Field Match evaluator with dynamic scoring and update configurations --- api/oss/src/core/evaluators/service.py | 22 +++ sdk/agenta/sdk/workflows/configurations.py | 1 + sdk/agenta/sdk/workflows/handlers.py | 149 +++++++++++++++++++++ sdk/agenta/sdk/workflows/interfaces.py | 52 +++++++ sdk/agenta/sdk/workflows/utils.py | 6 + 5 files changed, 230 insertions(+) diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index 6e547addfe..8a2d38955d 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1435,11 +1435,33 @@ def _transfer_evaluator_revision_data( else None ) headers = None + # TODO: This function reconstructs output schemas from old evaluator settings. + # When fully migrating to the new workflow-based evaluator system, the output + # schema should be stored directly in the evaluator revision (workflow revision) + # at configuration time, rather than being inferred from settings here. 
+ # For evaluators with dynamic outputs (auto_ai_critique, json_multi_field_match), + # the frontend/API should build and save the complete output schema when the + # user configures the evaluator. outputs_schema = None if str(old_evaluator.evaluator_key) == "auto_ai_critique": json_schema = old_evaluator.settings_values.get("json_schema", None) if json_schema and isinstance(json_schema, dict): outputs_schema = json_schema.get("schema", None) + # Handle json_multi_field_match with dynamic field-based properties + if str(old_evaluator.evaluator_key) == "json_multi_field_match": + # Build dynamic properties based on configured fields + fields = old_evaluator.settings_values.get("fields", []) + properties = {"score": {"type": "number"}} + for field in fields: + # Each field becomes a numeric score (0 or 1) + properties[field] = {"type": "number"} + outputs_schema = { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": properties, + "required": ["score"], + "additionalProperties": False, + } if not outputs_schema: properties = ( {"score": {"type": "number"}, "success": {"type": "boolean"}} diff --git a/sdk/agenta/sdk/workflows/configurations.py b/sdk/agenta/sdk/workflows/configurations.py index 9086047c53..42310b9368 100644 --- a/sdk/agenta/sdk/workflows/configurations.py +++ b/sdk/agenta/sdk/workflows/configurations.py @@ -5,6 +5,7 @@ auto_exact_match_v0_configuration = WorkflowServiceConfiguration() auto_regex_test_v0_configuration = WorkflowServiceConfiguration() field_match_test_v0_configuration = WorkflowServiceConfiguration() +json_multi_field_match_v0_configuration = WorkflowServiceConfiguration() auto_webhook_test_v0_configuration = WorkflowServiceConfiguration() auto_custom_code_run_v0_configuration = WorkflowServiceConfiguration() auto_ai_critique_v0_configuration = WorkflowServiceConfiguration() diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index 7216761897..c628fa21a1 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -537,6 +537,155 @@ def field_match_test_v0( return {"success": success} +def _get_nested_value(obj: Any, path: str) -> Any: + """ + Get value from nested dict using dot notation path. + + Args: + obj: The object to traverse (dict or list) + path: Dot-separated path like "user.address.city" or "items.0.name" + + Returns: + The value at the path, or None if path doesn't exist + """ + if obj is None: + return None + + keys = path.split(".") + value = obj + + for key in keys: + if isinstance(value, dict): + value = value.get(key) + elif isinstance(value, list) and key.isdigit(): + idx = int(key) + value = value[idx] if 0 <= idx < len(value) else None + else: + return None + + if value is None: + return None + + return value + + +@instrument(annotate=True) +def json_multi_field_match_v0( + parameters: Optional[Data] = None, + inputs: Optional[Data] = None, + outputs: Optional[Union[Data, str]] = None, +) -> Any: + """ + Multi-field JSON match evaluator for comparing multiple fields between expected and actual JSON. + + Each configured field becomes a separate score (0 or 1), and an overall score shows + the ratio of matching fields. Useful for entity extraction validation. 
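+
+    Example (illustrative): with parameters {"fields": ["name", "user.city"],
+    "correct_answer_key": "correct_answer"}, a matching "name" and a mismatched
+    "user.city" yield {"name": 1.0, "user.city": 0.0, "score": 0.5}.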
+ + Args: + inputs: Testcase data with ground truth JSON + outputs: Output from the workflow execution (expected to be JSON string or dict) + parameters: Configuration with: + - fields: List of field paths to compare (e.g., ["name", "user.address.city"]) + - correct_answer_key: Key in inputs containing the expected JSON + + Returns: + Dict with per-field scores and overall score, e.g.: + {"name": 1.0, "email": 0.0, "score": 0.5} + """ + if parameters is None or not isinstance(parameters, dict): + raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) + + if "fields" not in parameters: + raise MissingConfigurationParameterV0Error(path="fields") + + fields = parameters["fields"] + + if not isinstance(fields, list) or len(fields) == 0: + raise InvalidConfigurationParameterV0Error( + path="fields", + expected="non-empty list", + got=fields, + ) + + if "correct_answer_key" not in parameters: + raise MissingConfigurationParameterV0Error(path="correct_answer_key") + + correct_answer_key = str(parameters["correct_answer_key"]) + + if inputs is None or not isinstance(inputs, dict): + raise InvalidInputsV0Error(expected="dict", got=inputs) + + if correct_answer_key not in inputs: + raise MissingInputV0Error(path=correct_answer_key) + + correct_answer = inputs[correct_answer_key] + + # Parse ground truth JSON + if isinstance(correct_answer, str): + try: + expected = json.loads(correct_answer) + except json.JSONDecodeError as e: + raise InvalidInputV0Error( + path=correct_answer_key, + expected="valid JSON string", + got=correct_answer, + ) + elif isinstance(correct_answer, dict): + expected = correct_answer + else: + raise InvalidInputV0Error( + path=correct_answer_key, + expected=["dict", "str"], + got=correct_answer, + ) + + # Parse output JSON + if not isinstance(outputs, str) and not isinstance(outputs, dict): + # Return all zeros if output is invalid + results: Dict[str, Any] = {field: 0.0 for field in fields} + results["score"] = 0.0 + return results + + if isinstance(outputs, str): + try: + actual = json.loads(outputs) + except json.JSONDecodeError: + # Return all zeros if output is not valid JSON + results = {field: 0.0 for field in fields} + results["score"] = 0.0 + return results + else: + actual = outputs + + if not isinstance(actual, dict): + # Return all zeros if parsed output is not a dict + results = {field: 0.0 for field in fields} + results["score"] = 0.0 + return results + + # -------------------------------------------------------------------------- + # Compare each configured field + results = {} + matches = 0 + + for field_path in fields: + expected_val = _get_nested_value(expected, field_path) + actual_val = _get_nested_value(actual, field_path) + + # Exact match comparison + match = expected_val == actual_val + + results[field_path] = 1.0 if match else 0.0 + if match: + matches += 1 + + # Overall score is ratio of matching fields + results["score"] = matches / len(fields) if fields else 0.0 + # -------------------------------------------------------------------------- + + return results + + @instrument(annotate=True) async def auto_webhook_test_v0( parameters: Optional[Data] = None, diff --git a/sdk/agenta/sdk/workflows/interfaces.py b/sdk/agenta/sdk/workflows/interfaces.py index 85334ab6cb..d9a8425d80 100644 --- a/sdk/agenta/sdk/workflows/interfaces.py +++ b/sdk/agenta/sdk/workflows/interfaces.py @@ -169,6 +169,58 @@ ), ) +json_multi_field_match_v0_interface = WorkflowServiceInterface( + uri="agenta:built-in:json_multi_field_match:v0", + schemas=dict( # 
type: ignore + parameters={ + "type": "object", + "title": "JSON Multi-Field Match Parameters", + "description": "Settings for comparing multiple JSON fields against expected values from a ground truth column.", + "properties": { + "correct_answer_key": { + "type": "string", + "title": "Ground Truth Column", + "description": "Column in test data containing the JSON ground truth.", + "default": "correct_answer", + }, + "fields": { + "type": "array", + "title": "Fields to Compare", + "description": "List of JSON field paths (dot notation) to compare. Each field becomes a separate score.", + "items": {"type": "string"}, + "default": [], + }, + }, + "required": ["correct_answer_key", "fields"], + "additionalProperties": False, + }, + inputs={ + "type": "object", + "title": "JSON Multi-Field Match Inputs", + "description": "Testcase data including the JSON ground truth.", + }, + outputs={ + "type": "object", + "title": "JSON Multi-Field Match Outputs", + "description": "Per-field match scores and overall match ratio. Each field produces a score_ output (0 or 1).", + "properties": { + "score": { + "type": "number", + "title": "Overall Score", + "description": "Ratio of matched fields (0-1).", + }, + "success": { + "type": "boolean", + "title": "Success", + "description": "True if all selected fields matched.", + }, + }, + "required": ["score", "success"], + "additionalProperties": True, # Allows dynamic score_ outputs + }, + ), +) + auto_webhook_test_v0_interface = WorkflowServiceInterface( uri="agenta:built-in:auto_webhook_test:v0", schemas=dict( # type: ignore diff --git a/sdk/agenta/sdk/workflows/utils.py b/sdk/agenta/sdk/workflows/utils.py index d86f499da4..2ecd57d219 100644 --- a/sdk/agenta/sdk/workflows/utils.py +++ b/sdk/agenta/sdk/workflows/utils.py @@ -9,6 +9,7 @@ auto_exact_match_v0, auto_regex_test_v0, field_match_test_v0, + json_multi_field_match_v0, auto_webhook_test_v0, auto_custom_code_run_v0, auto_ai_critique_v0, @@ -31,6 +32,7 @@ auto_exact_match_v0_interface, auto_regex_test_v0_interface, field_match_test_v0_interface, + json_multi_field_match_v0_interface, auto_webhook_test_v0_interface, auto_custom_code_run_v0_interface, auto_ai_critique_v0_interface, @@ -54,6 +56,7 @@ auto_exact_match_v0_configuration, auto_regex_test_v0_configuration, field_match_test_v0_configuration, + json_multi_field_match_v0_configuration, auto_webhook_test_v0_configuration, auto_custom_code_run_v0_configuration, auto_ai_critique_v0_configuration, @@ -78,6 +81,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0_interface), auto_regex_test=dict(v0=auto_regex_test_v0_interface), field_match_test=dict(v0=field_match_test_v0_interface), + json_multi_field_match=dict(v0=json_multi_field_match_v0_interface), auto_webhook_test=dict(v0=auto_webhook_test_v0_interface), auto_custom_code_run=dict(v0=auto_custom_code_run_v0_interface), auto_ai_critique=dict(v0=auto_ai_critique_v0_interface), @@ -104,6 +108,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0_configuration), auto_regex_test=dict(v0=auto_regex_test_v0_configuration), field_match_test=dict(v0=field_match_test_v0_configuration), + json_multi_field_match=dict(v0=json_multi_field_match_v0_configuration), auto_webhook_test=dict(v0=auto_webhook_test_v0_configuration), auto_custom_code_run=dict(v0=auto_custom_code_run_v0_configuration), auto_ai_critique=dict(v0=auto_ai_critique_v0_configuration), @@ -160,6 +165,7 @@ auto_exact_match=dict(v0=auto_exact_match_v0), auto_regex_test=dict(v0=auto_regex_test_v0), field_match_test=dict(v0=field_match_test_v0), + 
json_multi_field_match=dict(v0=json_multi_field_match_v0), auto_webhook_test=dict(v0=auto_webhook_test_v0), auto_custom_code_run=dict(v0=auto_custom_code_run_v0), auto_ai_critique=dict(v0=auto_ai_critique_v0), From 7c32857f1872141cbdd2138b351f3c0b86633e7c Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:20:45 +0100 Subject: [PATCH 04/11] feat(api): add json_multi_field_match evaluator and corresponding icon mapping --- web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts | 1 + web/oss/src/services/evaluators/index.ts | 1 + 2 files changed, 2 insertions(+) diff --git a/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts b/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts index ec01f427c4..ff617479c5 100644 --- a/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts +++ b/web/oss/src/components/Evaluators/assets/evaluatorFiltering.ts @@ -20,6 +20,7 @@ export const ENABLED_EVALUATORS = [ "auto_semantic_similarity", "auto_regex_test", "field_match_test", + "json_multi_field_match", "auto_json_diff", "auto_ai_critique", "auto_custom_code_run", diff --git a/web/oss/src/services/evaluators/index.ts b/web/oss/src/services/evaluators/index.ts index 8654f1e59f..e0f648fad8 100644 --- a/web/oss/src/services/evaluators/index.ts +++ b/web/oss/src/services/evaluators/index.ts @@ -67,6 +67,7 @@ const evaluatorIconsMap = { auto_similarity_match: similarityImg, auto_regex_test: regexImg, field_match_test: exactMatchImg, + json_multi_field_match: bracketCurlyImg, auto_webhook_test: webhookImg, auto_ai_critique: aiImg, auto_custom_code_run: codeImg, From 64f5c1435d7465afe9f9a2d5f09ea559c9bfa4ac Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:54:17 +0100 Subject: [PATCH 05/11] refactor(api): update json_multi_field_match evaluator to use aggregate_score instead of score and enhance field comparison descriptions --- api/oss/src/core/evaluators/service.py | 4 ++-- api/oss/src/resources/evaluators/evaluators.py | 6 +++--- sdk/agenta/sdk/workflows/handlers.py | 18 +++++++++--------- sdk/agenta/sdk/workflows/interfaces.py | 17 ++++++----------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index 8a2d38955d..076e82439a 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1451,7 +1451,7 @@ def _transfer_evaluator_revision_data( if str(old_evaluator.evaluator_key) == "json_multi_field_match": # Build dynamic properties based on configured fields fields = old_evaluator.settings_values.get("fields", []) - properties = {"score": {"type": "number"}} + properties = {"aggregate_score": {"type": "number"}} for field in fields: # Each field becomes a numeric score (0 or 1) properties[field] = {"type": "number"} @@ -1459,7 +1459,7 @@ def _transfer_evaluator_revision_data( "$schema": "https://json-schema.org/draft/2020-12/schema", "type": "object", "properties": properties, - "required": ["score"], + "required": ["aggregate_score"], "additionalProperties": False, } if not outputs_schema: diff --git a/api/oss/src/resources/evaluators/evaluators.py b/api/oss/src/resources/evaluators/evaluators.py index c6b2b189d9..d4f4965f7a 100644 --- a/api/oss/src/resources/evaluators/evaluators.py +++ b/api/oss/src/resources/evaluators/evaluators.py @@ -363,9 +363,9 @@ "settings_template": { "fields": { "label": "Fields to Compare", - "type": "fields_checkbox_list", # Custom type - checkbox list with 
auto-detection from testcase + "type": "fields_tags_editor", # Custom type - tag-based add/remove editor "required": True, - "description": "Select which JSON fields to compare (auto-detected from testcase)", + "description": "Add fields to compare using dot notation for nested paths (e.g., user.name)", }, "correct_answer_key": { "label": "Expected Answer Column", @@ -377,7 +377,7 @@ "advanced": True, # Hidden in advanced section }, }, - "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate score column (0 or 1), with an overall score showing the match ratio. Useful for entity extraction validation.", + "description": "Compares configured fields in expected JSON against LLM output. Each field becomes a separate metric (0 or 1), with an aggregate_score showing the percentage of matching fields. Useful for entity extraction validation.", "requires_testcase": "always", "requires_trace": "always", "oss": True, diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index c628fa21a1..74831ddb2b 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -578,8 +578,8 @@ def json_multi_field_match_v0( """ Multi-field JSON match evaluator for comparing multiple fields between expected and actual JSON. - Each configured field becomes a separate score (0 or 1), and an overall score shows - the ratio of matching fields. Useful for entity extraction validation. + Each configured field becomes a separate score (0 or 1), and an aggregate_score shows + the percentage of matching fields. Useful for entity extraction validation. Args: inputs: Testcase data with ground truth JSON @@ -589,8 +589,8 @@ def json_multi_field_match_v0( - correct_answer_key: Key in inputs containing the expected JSON Returns: - Dict with per-field scores and overall score, e.g.: - {"name": 1.0, "email": 0.0, "score": 0.5} + Dict with per-field scores and aggregate_score, e.g.: + {"name": 1.0, "email": 0.0, "aggregate_score": 0.5} """ if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) @@ -643,7 +643,7 @@ def json_multi_field_match_v0( if not isinstance(outputs, str) and not isinstance(outputs, dict): # Return all zeros if output is invalid results: Dict[str, Any] = {field: 0.0 for field in fields} - results["score"] = 0.0 + results["aggregate_score"] = 0.0 return results if isinstance(outputs, str): @@ -652,7 +652,7 @@ def json_multi_field_match_v0( except json.JSONDecodeError: # Return all zeros if output is not valid JSON results = {field: 0.0 for field in fields} - results["score"] = 0.0 + results["aggregate_score"] = 0.0 return results else: actual = outputs @@ -660,7 +660,7 @@ def json_multi_field_match_v0( if not isinstance(actual, dict): # Return all zeros if parsed output is not a dict results = {field: 0.0 for field in fields} - results["score"] = 0.0 + results["aggregate_score"] = 0.0 return results # -------------------------------------------------------------------------- @@ -679,8 +679,8 @@ def json_multi_field_match_v0( if match: matches += 1 - # Overall score is ratio of matching fields - results["score"] = matches / len(fields) if fields else 0.0 + # Aggregate score is the percentage of matching fields + results["aggregate_score"] = matches / len(fields) if fields else 0.0 # -------------------------------------------------------------------------- return results diff --git 
a/sdk/agenta/sdk/workflows/interfaces.py b/sdk/agenta/sdk/workflows/interfaces.py index d9a8425d80..6c1e5edfbf 100644 --- a/sdk/agenta/sdk/workflows/interfaces.py +++ b/sdk/agenta/sdk/workflows/interfaces.py @@ -202,21 +202,16 @@ outputs={ "type": "object", "title": "JSON Multi-Field Match Outputs", - "description": "Per-field match scores and overall match ratio. Each field produces a score_ output (0 or 1).", + "description": "Per-field match scores and aggregate score. Each field produces a 0 or 1 output.", "properties": { - "score": { + "aggregate_score": { "type": "number", - "title": "Overall Score", - "description": "Ratio of matched fields (0-1).", - }, - "success": { - "type": "boolean", - "title": "Success", - "description": "True if all selected fields matched.", + "title": "Aggregate Score", + "description": "Percentage of matched fields (0-1).", }, }, - "required": ["score", "success"], - "additionalProperties": True, # Allows dynamic score_ outputs + "required": ["aggregate_score"], + "additionalProperties": True, # Allows dynamic field outputs }, ), ) From 4a3065eb74b187c488c2e0e62a70c7383ced82fa Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 20:54:44 +0100 Subject: [PATCH 06/11] feat(frontend): add FieldsTagsEditor component for managing JSON field paths in evaluations --- .../ConfigureEvaluator/DynamicFormField.tsx | 3 + .../ConfigureEvaluator/FieldsTagsEditor.tsx | 283 ++++++++++++++++++ web/oss/src/lib/Types.ts | 1 + web/oss/src/lib/helpers/extractJsonPaths.ts | 86 ++++++ 4 files changed, 373 insertions(+) create mode 100644 web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx create mode 100644 web/oss/src/lib/helpers/extractJsonPaths.ts diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx index 3098026f1b..85f09fd173 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx @@ -11,6 +11,7 @@ import {isValidRegex} from "@/oss/lib/helpers/validators" import {generatePaths} from "@/oss/lib/transformers" import {EvaluationSettingsTemplate, JSSTheme} from "@/oss/lib/Types" +import {FieldsTagsEditor} from "./FieldsTagsEditor" import {JSONSchemaEditor} from "./JSONSchema" import {Messages} from "./Messages" @@ -215,6 +216,8 @@ export const DynamicFormField: React.FC = ({ : JSON.stringify(savedValue ?? {}, null, 2) } /> + ) : type === "fields_tags_editor" ? ( + ) : null} )} diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx new file mode 100644 index 0000000000..712bedd111 --- /dev/null +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -0,0 +1,283 @@ +/** + * FieldsTagsEditor - Tag-based editor for JSON field paths + * + * This component provides an add/remove interface for managing JSON field paths. 
+ * Users can: + * - Add fields manually using an input field (supports dot notation for nested paths) + * - Remove fields by clicking the X button on tags + * - Detect fields from the selected testcase using a dedicated button + * + * The component also displays a non-removable "overall" field representing + * the aggregate result across all fields. + * + * Auto-detection behavior: + * - When a testcase is loaded and no fields are configured, fields are auto-detected + */ + +import {useCallback, useEffect, useMemo, useRef, useState} from "react" + +import {CloseOutlined, PlusOutlined, SearchOutlined} from "@ant-design/icons" +import {Button, Input, Space, Tag, theme, Tooltip, Typography} from "antd" +import type {FormInstance} from "antd/es/form" +import {useAtomValue} from "jotai" +import {createUseStyles} from "react-jss" + +import {extractJsonPaths, safeParseJson} from "@/oss/lib/helpers/extractJsonPaths" +import type {JSSTheme} from "@/oss/lib/Types" + +import {playgroundSelectedTestcaseAtom} from "./state/atoms" + +const {Text} = Typography + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + container: { + display: "flex", + flexDirection: "column", + gap: 12, + }, + tagsContainer: { + display: "flex", + flexWrap: "wrap", + gap: 8, + padding: 12, + borderRadius: 6, + border: `1px solid ${theme.colorBorder}`, + backgroundColor: theme.colorBgContainer, + minHeight: 48, + }, + fieldTag: { + display: "flex", + alignItems: "center", + fontFamily: "monospace", + fontSize: 13, + margin: 0, + }, + matchRatioTag: { + fontFamily: "monospace", + fontSize: 13, + margin: 0, + fontWeight: 500, + }, + addFieldRow: { + display: "flex", + gap: 8, + }, + addInput: { + flex: 1, + fontFamily: "monospace", + }, + actionsRow: { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + }, + helperText: { + fontSize: 12, + color: theme.colorTextSecondary, + }, + emptyMessage: { + color: theme.colorTextSecondary, + fontSize: 13, + }, +})) + +interface FieldsTagsEditorProps { + value?: string[] + onChange?: (value: string[]) => void + form?: FormInstance + name?: string | string[] + correctAnswerKey?: string +} + +/** + * Tag-based editor for managing JSON field paths with add/remove functionality. + * Includes "Detect from testcase" feature to auto-populate fields. 
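+ *
+ * Usage sketch (illustrative; antd Form.Item injects value/onChange):
+ *     <Form.Item name={["settings_values", "fields"]}>
+ *         <FieldsTagsEditor form={form} correctAnswerKey="correct_answer" />
+ *     </Form.Item>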
+ */
+export const FieldsTagsEditor: React.FC<FieldsTagsEditorProps> = ({
+    value = [],
+    onChange,
+    form,
+    correctAnswerKey = "correct_answer",
+}) => {
+    const classes = useStyles()
+    const {token} = theme.useToken()
+    const [inputValue, setInputValue] = useState("")
+    // Track if we've already auto-detected to avoid re-triggering
+    const hasAutoDetectedRef = useRef(false)
+
+    // Read the selected testcase from the playground atom
+    const testcaseSelection = useAtomValue(playgroundSelectedTestcaseAtom)
+    const testcase = testcaseSelection?.testcase
+
+    // Get the correct_answer_key from form if available
+    const formCorrectAnswerKey = form?.getFieldValue(["settings_values", "correct_answer_key"])
+    const effectiveKey = formCorrectAnswerKey || correctAnswerKey
+
+    // Check if we can detect fields from testcase
+    const canDetectFields = useMemo(() => {
+        if (!testcase) return false
+        const groundTruthValue = testcase[effectiveKey]
+        if (!groundTruthValue) return false
+        const parsed = safeParseJson(groundTruthValue)
+        return parsed !== null
+    }, [testcase, effectiveKey])
+
+    // Extract available fields from the testcase
+    const detectableFields = useMemo(() => {
+        if (!testcase) return []
+        const groundTruthValue = testcase[effectiveKey]
+        if (!groundTruthValue) return []
+        const parsed = safeParseJson(groundTruthValue)
+        if (!parsed) return []
+        return extractJsonPaths(parsed)
+    }, [testcase, effectiveKey])
+
+    // Auto-detect fields when testcase is loaded and no fields are configured
+    useEffect(() => {
+        // Only auto-detect if:
+        // 1. We haven't already auto-detected
+        // 2. There are no user-defined fields
+        // 3. We can detect fields from the testcase
+        if (!hasAutoDetectedRef.current && value.length === 0 && detectableFields.length > 0) {
+            hasAutoDetectedRef.current = true
+            onChange?.(detectableFields)
+        }
+    }, [detectableFields, value.length, onChange])
+
+    // Handle adding a new field
+    const handleAddField = useCallback(() => {
+        const trimmed = inputValue.trim()
+        if (!trimmed) return
+
+        // Don't add duplicates
+        if (value.includes(trimmed)) {
+            setInputValue("")
+            return
+        }
+
+        // Don't allow reserved field names
+        if (trimmed === "aggregate_score") {
+            setInputValue("")
+            return
+        }
+
+        onChange?.([...value, trimmed])
+        setInputValue("")
+    }, [inputValue, value, onChange])
+
+    // Handle removing a field
+    const handleRemoveField = useCallback(
+        (fieldToRemove: string) => {
+            onChange?.(value.filter((f) => f !== fieldToRemove))
+        },
+        [value, onChange],
+    )
+
+    // Handle detecting fields from testcase (replaces existing fields)
+    const handleDetectFields = useCallback(() => {
+        if (detectableFields.length > 0) {
+            onChange?.(detectableFields)
+        }
+    }, [detectableFields, onChange])
+
+    // Handle Enter key in input
+    const handleInputKeyDown = useCallback(
+        (e: React.KeyboardEvent) => {
+            if (e.key === "Enter") {
+                e.preventDefault()
+                handleAddField()
+            }
+        },
+        [handleAddField],
+    )
+
+    // Generate tooltip for disabled detect button
+    const detectButtonTooltip = useMemo(() => {
+        if (!testcase) {
+            return "Select a testcase first to detect fields"
+        }
+        if (!canDetectFields) {
+            return `No JSON object found in the "${effectiveKey}" column`
+        }
+        return `Detect ${detectableFields.length} field(s) from testcase (replaces current fields)`
+    }, [testcase, canDetectFields, effectiveKey, detectableFields.length])
+
+    return (
+        <div className={classes.container}>
+            {/* Field Tags Display */}
+            <div className={classes.tagsContainer}>
+                {/* Non-removable aggregate_score tag */}
+                <Tooltip title="Aggregate match ratio across all configured fields">
+                    <Tag className={classes.matchRatioTag}>
+                        aggregate_score
+                    </Tag>
+                </Tooltip>
+
+                {/* User-defined field tags */}
+                {value.map((field) => (
+                    <Tag
+                        key={field}
+                        closable
+                        onClose={() => handleRemoveField(field)}
+                        className={classes.fieldTag}
+                    >
+                        {field}
+                    </Tag>
+                ))}
+
+                {/* Empty state message */}
+                {value.length === 0 && (
+                    <Text className={classes.emptyMessage}>
+                        Add fields to compare or detect them from a testcase
+                    </Text>
+                )}
+            </div>
+
+            {/* Add Field Input */}
+            <div className={classes.addFieldRow}>
+                <Input
+                    className={classes.addInput}
+                    value={inputValue}
+                    onChange={(e) => setInputValue(e.target.value)}
+                    onKeyDown={handleInputKeyDown}
+                    suffix={
+                        <Tooltip title="Use dot notation for nested paths, e.g. user.address.city">
+                            <Text type="secondary">?</Text>
+                        </Tooltip>
+                    }
+                />
+                <Button icon={<PlusOutlined />} onClick={handleAddField} />
+            </div>
+
+            {/* Actions Row */}
+            <div className={classes.actionsRow}>
+                <Text className={classes.helperText}>
+                    Each field creates a column with value 0 (no match) or 1 (match)
+                </Text>
+
+                <Tooltip title={detectButtonTooltip}>
+                    <Button
+                        icon={<SearchOutlined />}
+                        onClick={handleDetectFields}
+                        disabled={!canDetectFields}
+                    />
+                </Tooltip>
+            </div>
+        </div>
+    )
+}
+
+export default FieldsTagsEditor
diff --git a/web/oss/src/lib/Types.ts b/web/oss/src/lib/Types.ts
index c33b0ea498..4ca2ae5555 100644
--- a/web/oss/src/lib/Types.ts
+++ b/web/oss/src/lib/Types.ts
@@ -990,6 +990,7 @@ type ValueTypeOptions =
     | "messages"
     | "multiple_choice"
     | "llm_response_schema"
+    | "fields_tags_editor"
 
 export interface EvaluationSettingsTemplate {
     type: ValueTypeOptions
diff --git a/web/oss/src/lib/helpers/extractJsonPaths.ts b/web/oss/src/lib/helpers/extractJsonPaths.ts
new file mode 100644
index 0000000000..62176319e5
--- /dev/null
+++ b/web/oss/src/lib/helpers/extractJsonPaths.ts
+/**
+ * Utility functions for extracting JSON paths from objects.
+ * Used by the JSON Multi-Field Match evaluator to auto-detect fields from testcase data.
+ */
+
+/**
+ * Recursively extracts all leaf paths from a JSON object using dot notation.
+ *
+ * Example:
+ *   Input: {user: {name: "John", address: {city: "NYC"}}}
+ *   Output: ["user.name", "user.address.city"]
+ *
+ * @param obj - The object to extract paths from
+ * @param prefix - Current path prefix (used for recursion)
+ * @returns Array of dot-notation paths to all leaf values
+ */
+export const extractJsonPaths = (obj: unknown, prefix = ""): string[] => {
+    if (obj === null || obj === undefined) return []
+    if (typeof obj !== "object") return prefix ? [prefix] : []
+
+    // For arrays, we don't expand individual indices - just mark the path
+    // This keeps the UI manageable and matches common use cases
+    if (Array.isArray(obj)) {
+        return prefix ? [prefix] : []
+    }
+
+    const paths: string[] = []
+
+    for (const key of Object.keys(obj as Record<string, unknown>)) {
+        const newPrefix = prefix ? `${prefix}.${key}` : key
+        const value = (obj as Record<string, unknown>)[key]
+
+        if (value !== null && typeof value === "object" && !Array.isArray(value)) {
+            // Recurse into nested objects
+            paths.push(...extractJsonPaths(value, newPrefix))
+        } else {
+            // Leaf node (primitive, array, or null)
+            paths.push(newPrefix)
+        }
+    }
+
+    return paths
+}
+
+/**
+ * Parses a JSON string and extracts all paths.
+ * Returns empty array if parsing fails.
+ *
+ * @param jsonString - JSON string to parse and extract paths from
+ * @returns Array of dot-notation paths
+ */
+export const extractJsonPathsFromString = (jsonString: string): string[] => {
+    try {
+        const parsed = JSON.parse(jsonString)
+        return extractJsonPaths(parsed)
+    } catch {
+        return []
+    }
+}
+
+/**
+ * Safely parses a value that might be JSON string or already an object.
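+ *
+ * Example (illustrative):
+ *   safeParseJson('{"a": 1}')  // -> {a: 1}
+ *   safeParseJson([1, 2])      // -> null (arrays are rejected)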
+ * + * @param value - Value to parse (string or object) + * @returns Parsed object or null if invalid + */ +export const safeParseJson = (value: unknown): Record | null => { + if (value === null || value === undefined) return null + + if (typeof value === "object" && !Array.isArray(value)) { + return value as Record + } + + if (typeof value === "string") { + try { + const parsed = JSON.parse(value) + if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) { + return parsed + } + } catch { + return null + } + } + + return null +} From d47fc55fc84e2d51c3f3d82490a2e1693661dd26 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 21:03:12 +0100 Subject: [PATCH 07/11] refactor(frontend): simplify FieldsTagsEditor component by removing unused styles and optimizing class names --- .../ConfigureEvaluator/FieldsTagsEditor.tsx | 80 +++---------------- 1 file changed, 12 insertions(+), 68 deletions(-) diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx index 712bedd111..c446c95d4b 100644 --- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx +++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx @@ -16,71 +16,17 @@ import {useCallback, useEffect, useMemo, useRef, useState} from "react" -import {CloseOutlined, PlusOutlined, SearchOutlined} from "@ant-design/icons" -import {Button, Input, Space, Tag, theme, Tooltip, Typography} from "antd" +import {PlusOutlined, SearchOutlined} from "@ant-design/icons" +import {Button, Input, Tag, Tooltip, Typography} from "antd" import type {FormInstance} from "antd/es/form" import {useAtomValue} from "jotai" -import {createUseStyles} from "react-jss" import {extractJsonPaths, safeParseJson} from "@/oss/lib/helpers/extractJsonPaths" -import type {JSSTheme} from "@/oss/lib/Types" import {playgroundSelectedTestcaseAtom} from "./state/atoms" const {Text} = Typography -const useStyles = createUseStyles((theme: JSSTheme) => ({ - container: { - display: "flex", - flexDirection: "column", - gap: 12, - }, - tagsContainer: { - display: "flex", - flexWrap: "wrap", - gap: 8, - padding: 12, - borderRadius: 6, - border: `1px solid ${theme.colorBorder}`, - backgroundColor: theme.colorBgContainer, - minHeight: 48, - }, - fieldTag: { - display: "flex", - alignItems: "center", - fontFamily: "monospace", - fontSize: 13, - margin: 0, - }, - matchRatioTag: { - fontFamily: "monospace", - fontSize: 13, - margin: 0, - fontWeight: 500, - }, - addFieldRow: { - display: "flex", - gap: 8, - }, - addInput: { - flex: 1, - fontFamily: "monospace", - }, - actionsRow: { - display: "flex", - alignItems: "center", - justifyContent: "space-between", - }, - helperText: { - fontSize: 12, - color: theme.colorTextSecondary, - }, - emptyMessage: { - color: theme.colorTextSecondary, - fontSize: 13, - }, -})) - interface FieldsTagsEditorProps { value?: string[] onChange?: (value: string[]) => void @@ -99,8 +45,6 @@ export const FieldsTagsEditor: React.FC = ({ form, correctAnswerKey = "correct_answer", }) => { - const classes = useStyles() - const {token} = theme.useToken() const [inputValue, setInputValue] = useState("") // Track if we've already auto-detected to avoid re-triggering const hasAutoDetectedRef = useRef(false) @@ -203,12 +147,12 @@ export const 
FieldsTagsEditor: React.FC = ({ }, [testcase, canDetectFields, effectiveKey, detectableFields.length]) return ( -
+
{/* Field Tags Display */} -
+
{/* Non-removable aggregate_score tag */} - + aggregate_score @@ -219,7 +163,7 @@ export const FieldsTagsEditor: React.FC = ({ key={field} closable onClose={() => handleRemoveField(field)} - className={classes.fieldTag} + className="flex items-center font-mono text-[13px] !m-0" > {field} @@ -227,23 +171,23 @@ export const FieldsTagsEditor: React.FC = ({ {/* Empty state message */} {value.length === 0 && ( - + Add fields to compare or detect them from a testcase )}
{/* Add Field Input */} -
+
setInputValue(e.target.value)} onKeyDown={handleInputKeyDown} suffix={ - + ? @@ -259,8 +203,8 @@ export const FieldsTagsEditor: React.FC = ({
{/* Actions Row */} -
- +
+ Each field creates a column with value 0 (no match) or 1 (match) From 0df3ddc1c744563be308d678c2a6213f65f52e8c Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 21:03:23 +0100 Subject: [PATCH 08/11] refactor(api): clean up unused imports and improve parameter checks in workflow handlers --- api/oss/src/core/evaluators/service.py | 10 ---- sdk/agenta/sdk/workflows/handlers.py | 71 ++++++++++++-------------- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/api/oss/src/core/evaluators/service.py b/api/oss/src/core/evaluators/service.py index 076e82439a..5a2b97250c 100644 --- a/api/oss/src/core/evaluators/service.py +++ b/api/oss/src/core/evaluators/service.py @@ -1,13 +1,9 @@ from typing import Optional, List from uuid import UUID, uuid4 -from json import loads from oss.src.utils.helpers import get_slug_from_name_and_id from oss.src.services.db_manager import fetch_evaluator_config from oss.src.core.workflows.dtos import ( - WorkflowFlags, - WorkflowQueryFlags, - # WorkflowCreate, WorkflowEdit, WorkflowQuery, @@ -17,8 +13,6 @@ WorkflowVariantEdit, WorkflowVariantQuery, # - WorkflowRevisionData, - # WorkflowRevisionCreate, WorkflowRevisionEdit, WorkflowRevisionCommit, @@ -35,11 +29,7 @@ SimpleEvaluatorEdit, SimpleEvaluatorQuery, SimpleEvaluatorFlags, - SimpleEvaluatorQueryFlags, - # EvaluatorFlags, - EvaluatorQueryFlags, - # Evaluator, EvaluatorQuery, EvaluatorRevisionsLog, diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index 74831ddb2b..dc2e7cd20e 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -1,4 +1,4 @@ -from typing import List, Any, Optional, Any, Dict, Union +from typing import List, Optional, Any, Dict, Union from json import dumps, loads import traceback import json @@ -22,7 +22,6 @@ from agenta.sdk.decorators.tracing import instrument from agenta.sdk.models.shared import Data -from agenta.sdk.models.tracing import Trace from agenta.sdk.workflows.sandbox import execute_code_safely from agenta.sdk.workflows.errors import ( InvalidConfigurationParametersV0Error, @@ -32,7 +31,6 @@ MissingInputV0Error, InvalidInputV0Error, InvalidOutputsV0Error, - MissingOutputV0Error, InvalidSecretsV0Error, JSONDiffV0Error, LevenshteinDistanceV0Error, @@ -46,7 +44,6 @@ PromptCompletionV0Error, ) -from agenta.sdk.litellm import mockllm from agenta.sdk.litellm.litellm import litellm_handler litellm.logging = False @@ -76,9 +73,7 @@ def _compute_similarity(embedding_1: List[float], embedding_2: List[float]) -> f return dot / (norm1 * norm2) -import json -import re -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Iterable, Tuple try: import jsonpath # ✅ use module API @@ -389,7 +384,7 @@ def auto_exact_match_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -397,7 +392,7 @@ def auto_exact_match_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -434,7 +429,7 @@ def auto_regex_test_v0( if parameters is None or not 
isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "regex_pattern" in parameters: + if "regex_pattern" not in parameters: raise MissingConfigurationParameterV0Error(path="regex_pattern") regex_pattern = parameters["regex_pattern"] @@ -492,12 +487,12 @@ def field_match_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "json_field" in parameters: + if "json_field" not in parameters: raise MissingConfigurationParameterV0Error(path="json_field") json_field = str(parameters["json_field"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -505,7 +500,7 @@ def field_match_test_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -518,7 +513,7 @@ def field_match_test_v0( if isinstance(outputs, str): try: outputs_dict = loads(outputs) - except json.JSONDecodeError as e: + except json.JSONDecodeError: # raise InvalidOutputsV0Error(expected="dict", got=outputs) from e return {"success": False} @@ -526,7 +521,7 @@ def field_match_test_v0( # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs) return {"success": False} - if not json_field in outputs_dict: + if json_field not in outputs_dict: # raise MissingOutputV0Error(path=json_field) return {"success": False} @@ -624,7 +619,7 @@ def json_multi_field_match_v0( if isinstance(correct_answer, str): try: expected = json.loads(correct_answer) - except json.JSONDecodeError as e: + except json.JSONDecodeError: raise InvalidInputV0Error( path=correct_answer_key, expected="valid JSON string", @@ -706,12 +701,12 @@ async def auto_webhook_test_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "webhook_url" in parameters: + if "webhook_url" not in parameters: raise MissingConfigurationParameterV0Error(path="webhook_url") webhook_url = str(parameters["webhook_url"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -719,7 +714,7 @@ async def auto_webhook_test_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -811,12 +806,12 @@ async def auto_custom_code_run_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "code" in parameters: + if "code" not in parameters: raise MissingConfigurationParameterV0Error(path="code") code = str(parameters["code"]) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -824,7 +819,7 @@ async def 
auto_custom_code_run_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -902,7 +897,7 @@ async def auto_ai_critique_v0( correct_answer_key = parameters.get("correct_answer_key") - if not "prompt_template" in parameters: + if "prompt_template" not in parameters: raise MissingConfigurationParameterV0Error(path="prompt_template") prompt_template = parameters.get("prompt_template") @@ -933,7 +928,7 @@ async def auto_ai_critique_v0( "json_schema" if template_version == "4" else "text" ) - if not response_type in ["text", "json_object", "json_schema"]: + if response_type not in ["text", "json_object", "json_schema"]: raise InvalidConfigurationParameterV0Error( path="response_type", expected=["text", "json_object", "json_schema"], @@ -1135,7 +1130,7 @@ def auto_starts_with_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "prefix" in parameters: + if "prefix" not in parameters: raise MissingConfigurationParameterV0Error(path="prefix") prefix = parameters["prefix"] @@ -1184,7 +1179,7 @@ def auto_ends_with_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "suffix" in parameters: + if "suffix" not in parameters: raise MissingConfigurationParameterV0Error(path="suffix") suffix = parameters["suffix"] @@ -1233,7 +1228,7 @@ def auto_contains_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substring" in parameters: + if "substring" not in parameters: raise MissingConfigurationParameterV0Error(path="substring") substring = parameters["substring"] @@ -1282,7 +1277,7 @@ def auto_contains_any_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substrings" in parameters: + if "substrings" not in parameters: raise MissingConfigurationParameterV0Error(path="substrings") substrings = parameters["substrings"] @@ -1340,7 +1335,7 @@ def auto_contains_all_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "substrings" in parameters: + if "substrings" not in parameters: raise MissingConfigurationParameterV0Error(path="substrings") substrings = parameters["substrings"] @@ -1440,7 +1435,7 @@ def auto_json_diff_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1448,7 +1443,7 @@ def auto_json_diff_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1532,7 +1527,7 @@ def auto_levenshtein_distance_v0( if parameters is None or not isinstance(parameters, dict): raise 
InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1542,7 +1537,7 @@ def auto_levenshtein_distance_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1637,7 +1632,7 @@ def auto_similarity_match_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1647,7 +1642,7 @@ def auto_similarity_match_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1730,7 +1725,7 @@ async def auto_semantic_similarity_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "correct_answer_key" in parameters: + if "correct_answer_key" not in parameters: raise MissingConfigurationParameterV0Error(path="correct_answer_key") correct_answer_key = str(parameters["correct_answer_key"]) @@ -1743,7 +1738,7 @@ async def auto_semantic_similarity_v0( if inputs is None or not isinstance(inputs, dict): raise InvalidInputsV0Error(expected="dict", got=inputs) - if not correct_answer_key in inputs: + if correct_answer_key not in inputs: raise MissingInputV0Error(path=correct_answer_key) correct_answer = inputs[correct_answer_key] @@ -1845,7 +1840,7 @@ async def completion_v0( if parameters is None or not isinstance(parameters, dict): raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters) - if not "prompt" in parameters: + if "prompt" not in parameters: raise MissingConfigurationParameterV0Error(path="prompt") params: Dict[str, Any] = {**(parameters or {})} From 04c087ca84752c43368d84a0c8067fd8dd617226 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 30 Dec 2025 21:04:43 +0100 Subject: [PATCH 09/11] fix(api): use aggregate_score instead of score in json_multi_field_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes naming inconsistency between API service and SDK interface schema. The interface defines `aggregate_score` as the required output field, so the service must use the same name to pass schema validation. Also applies ruff auto-cleanup for unused imports. 
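
Illustrative result shape after the rename, for fields ["name", "email"] with
one field matching: {"name": 1.0, "email": 0.0, "aggregate_score": 0.5}.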
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- api/oss/src/services/evaluators_service.py | 32 ++++++++++------------ 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index 25505e4029..74902b0a8d 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -253,7 +253,7 @@ async def auto_exact_match( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -487,8 +487,8 @@ async def json_multi_field_match( if match: matches += 1 - # Overall score is average of field scores - results["score"] = matches / len(fields) if fields else 0.0 + # Aggregate score is the percentage of matching fields + results["aggregate_score"] = matches / len(fields) if fields else 0.0 return {"outputs": results} @@ -576,7 +576,7 @@ async def auto_custom_code_run( ) ) return Result(type="number", value=response["outputs"]["score"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -645,7 +645,7 @@ async def auto_ai_critique( ) ) return Result(type="number", value=response["outputs"]["score"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -656,9 +656,7 @@ async def auto_ai_critique( ) -import json -import re -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Dict, Iterable, Tuple try: import jsonpath # ✅ use module API @@ -1295,7 +1293,7 @@ async def auto_starts_with( ) ) return Result(type="bool", value=response["outputs"]["success"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1337,7 +1335,7 @@ async def auto_ends_with( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1379,7 +1377,7 @@ async def auto_contains( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1421,7 +1419,7 @@ async def auto_contains_any( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1464,7 +1462,7 @@ async def auto_contains_all( ) result = Result(type="bool", value=response["outputs"]["success"]) return result - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1512,7 +1510,7 @@ async def auto_contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) return Result(type="bool", value=response["outputs"]["success"]) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -1530,7 +1528,7 @@ async def contains_json(input: 
EvaluatorInputInterface) -> EvaluatorOutputInterf potential_json = str(input.inputs["prediction"])[start_index:end_index] json.loads(potential_json) contains_json = True - except (ValueError, json.JSONDecodeError) as e: + except (ValueError, json.JSONDecodeError): contains_json = False return {"outputs": {"success": contains_json}} @@ -1992,7 +1990,7 @@ async def auto_levenshtein_distance( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, @@ -2032,7 +2030,7 @@ async def auto_similarity_match( message=str(e), ), ) - except Exception as e: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return Result( type="error", value=None, From 325d571690414a8e9ebf19c612098ebaeddd28bd Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Wed, 31 Dec 2025 15:09:24 +0100 Subject: [PATCH 10/11] refactor(api): enhance get_nested_value function to support multiple path formats Updated the get_nested_value function to utilize resolve_any() for improved path resolution, allowing support for dot notation, JSON Path, and JSON Pointer formats. Cleaned up imports and ensured consistent error handling for path resolution failures. --- api/oss/src/services/evaluators_service.py | 59 +++++++--------- sdk/agenta/sdk/workflows/handlers.py | 78 +++++++++------------- 2 files changed, 56 insertions(+), 81 deletions(-) diff --git a/api/oss/src/services/evaluators_service.py b/api/oss/src/services/evaluators_service.py index 33a557dce9..04a0fbfc6b 100644 --- a/api/oss/src/services/evaluators_service.py +++ b/api/oss/src/services/evaluators_service.py @@ -1,34 +1,31 @@ -import re import json +import re import traceback -from typing import Any, Dict, Union, List, Optional +from typing import Any, Dict, List, Optional, Union -import litellm import httpx +import litellm import numpy as np -from openai import AsyncOpenAI +from agenta.sdk.managers.secrets import SecretsManager from fastapi import HTTPException from numpy._core._multiarray_umath import array -# from autoevals.ragas import Faithfulness, ContextRelevancy # Commented out due to autoevals removal - -from oss.src.utils.logging import get_module_logger -from oss.src.services.security import sandbox -from oss.src.models.shared_models import Error, Result +from openai import AsyncOpenAI from oss.src.models.api.evaluation_model import ( EvaluatorInputInterface, - EvaluatorOutputInterface, EvaluatorMappingInputInterface, EvaluatorMappingOutputInterface, + EvaluatorOutputInterface, ) +from oss.src.models.shared_models import Error, Result +from oss.src.services.security import sandbox + +# from autoevals.ragas import Faithfulness, ContextRelevancy # Commented out due to autoevals removal +from oss.src.utils.logging import get_module_logger from oss.src.utils.traces import ( - remove_trace_prefix, - process_distributed_trace_into_trace_tree, get_field_value_from_trace_tree, + process_distributed_trace_into_trace_tree, ) -from agenta.sdk.managers.secrets import SecretsManager - - log = get_module_logger(__name__) @@ -354,35 +351,27 @@ async def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInt def get_nested_value(obj: Any, path: str) -> Any: """ - Get value from nested dict/object using dot notation (e.g., 'user.address.city'). + Get value from nested object using resolve_any() with graceful None on failure. 
+ + Supports multiple path formats: + - Dot notation: "user.address.city", "items.0.name" + - JSON Path: "$.user.address.city", "$.items[0].name" + - JSON Pointer: "/user/address/city", "/items/0/name" Args: obj: The object to traverse (dict or nested structure) - path: Dot-notation path to the value (e.g., 'user.address.city') + path: Path expression in any supported format Returns: - The value at the specified path, or None if path doesn't exist + The value at the specified path, or None if path doesn't exist or resolution fails """ if obj is None: return None - keys = path.split(".") - value = obj - - for key in keys: - if isinstance(value, dict): - value = value.get(key) - elif isinstance(value, list) and key.isdigit(): - # Support array indexing with numeric keys - idx = int(key) - value = value[idx] if 0 <= idx < len(value) else None - else: - return None - - if value is None: - return None - - return value + try: + return resolve_any(path, obj) + except (KeyError, IndexError, ValueError, TypeError, ImportError): + return None async def auto_json_multi_field_match( diff --git a/sdk/agenta/sdk/workflows/handlers.py b/sdk/agenta/sdk/workflows/handlers.py index dc2e7cd20e..7fc8b0e670 100644 --- a/sdk/agenta/sdk/workflows/handlers.py +++ b/sdk/agenta/sdk/workflows/handlers.py @@ -1,50 +1,43 @@ -from typing import List, Optional, Any, Dict, Union -from json import dumps, loads -import traceback import json -import re import math +import re +import traceback +from difflib import SequenceMatcher +from json import dumps, loads +from typing import Any, Dict, List, Optional, Union import httpx - import litellm - -from pydantic import BaseModel, Field -from openai import AsyncOpenAI, OpenAIError -from difflib import SequenceMatcher - -from agenta.sdk.utils.logging import get_module_logger - +from agenta.sdk.decorators.tracing import instrument from agenta.sdk.litellm import mockllm -from agenta.sdk.types import PromptTemplate, Message +from agenta.sdk.litellm.litellm import litellm_handler from agenta.sdk.managers.secrets import SecretsManager - -from agenta.sdk.decorators.tracing import instrument - from agenta.sdk.models.shared import Data -from agenta.sdk.workflows.sandbox import execute_code_safely +from agenta.sdk.types import Message, PromptTemplate +from agenta.sdk.utils.logging import get_module_logger from agenta.sdk.workflows.errors import ( + CustomCodeServerV0Error, InvalidConfigurationParametersV0Error, - MissingConfigurationParameterV0Error, InvalidConfigurationParameterV0Error, InvalidInputsV0Error, - MissingInputV0Error, InvalidInputV0Error, InvalidOutputsV0Error, InvalidSecretsV0Error, JSONDiffV0Error, LevenshteinDistanceV0Error, - SyntacticSimilarityV0Error, + MissingConfigurationParameterV0Error, + MissingInputV0Error, + PromptCompletionV0Error, + PromptFormattingV0Error, + RegexPatternV0Error, SemanticSimilarityV0Error, - WebhookServerV0Error, + SyntacticSimilarityV0Error, WebhookClientV0Error, - CustomCodeServerV0Error, - RegexPatternV0Error, - PromptFormattingV0Error, - PromptCompletionV0Error, + WebhookServerV0Error, ) - -from agenta.sdk.litellm.litellm import litellm_handler +from agenta.sdk.workflows.sandbox import execute_code_safely +from openai import AsyncOpenAI, OpenAIError +from pydantic import BaseModel, Field litellm.logging = False litellm.set_verbose = False @@ -534,34 +527,27 @@ def field_match_test_v0( def _get_nested_value(obj: Any, path: str) -> Any: """ - Get value from nested dict using dot notation path. 
+    Get value from nested object using resolve_any() with graceful None on failure.
+
+    Supports multiple path formats:
+    - Dot notation: "user.address.city", "items.0.name"
+    - JSON Path: "$.user.address.city", "$.items[0].name"
+    - JSON Pointer: "/user/address/city", "/items/0/name"
 
     Args:
         obj: The object to traverse (dict or list)
-        path: Dot-separated path like "user.address.city" or "items.0.name"
+        path: Path expression in any supported format
 
     Returns:
-        The value at the path, or None if path doesn't exist
+        The value at the path, or None if path doesn't exist or resolution fails
     """
     if obj is None:
         return None
 
-    keys = path.split(".")
-    value = obj
-
-    for key in keys:
-        if isinstance(value, dict):
-            value = value.get(key)
-        elif isinstance(value, list) and key.isdigit():
-            idx = int(key)
-            value = value[idx] if 0 <= idx < len(value) else None
-        else:
-            return None
-
-        if value is None:
-            return None
-
-    return value
+    try:
+        return resolve_any(path, obj)
+    except (KeyError, IndexError, ValueError, TypeError, ImportError):
+        return None
 
 
 @instrument(annotate=True)

From cc255adb693fc5900038336b2365a09df53151e9 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk
Date: Mon, 5 Jan 2026 21:51:49 +0100
Subject: [PATCH 11/11] Refactor FieldsTagsEditor to use Form.useWatch for
 correct_answer_key reactivity

- Updated FieldsTagsEditor to use Form.useWatch instead of form.getFieldValue
  when reading correct_answer_key.
- form.getFieldValue returns a one-off snapshot taken at render time, so the
  component could keep a stale value; Form.useWatch subscribes to the field
  and re-renders the component whenever its value changes.

---
 .../ConfigureEvaluator/FieldsTagsEditor.tsx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
index c446c95d4b..a96a07a37f 100644
--- a/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
+++ b/web/oss/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/FieldsTagsEditor.tsx
@@ -17,7 +17,7 @@
 import {useCallback, useEffect, useMemo, useRef, useState} from "react"
 
 import {PlusOutlined, SearchOutlined} from "@ant-design/icons"
-import {Button, Input, Tag, Tooltip, Typography} from "antd"
+import {Button, Form, Input, Tag, Tooltip, Typography} from "antd"
 import type {FormInstance} from "antd/es/form"
 
 import {useAtomValue} from "jotai"
@@ -53,8 +53,9 @@ export const FieldsTagsEditor: React.FC = ({
     const testcaseSelection = useAtomValue(playgroundSelectedTestcaseAtom)
     const testcase = testcaseSelection?.testcase
 
-    // Get the correct_answer_key from form if available
-    const formCorrectAnswerKey = form?.getFieldValue(["settings_values", "correct_answer_key"])
+    // Watch the correct_answer_key from form to react to changes
+    // Using Form.useWatch instead of form.getFieldValue for reactivity
+    const formCorrectAnswerKey = Form.useWatch(["settings_values", "correct_answer_key"], form)
     const effectiveKey = formCorrectAnswerKey || correctAnswerKey
 
     // Check if we can detect fields from testcase
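
For illustration, a minimal standalone sketch of the behavioral difference
this patch relies on — the demo component and field path are assumptions for
the example, not code from FieldsTagsEditor. getFieldValue reads the form
store once per render without subscribing, while Form.useWatch subscribes and
triggers a re-render on every change:

    import {Form, Input, Typography} from "antd"

    // Hypothetical demo component, not part of the patch.
    const WatchDemo = () => {
        const [form] = Form.useForm()
        // Subscribes to the field: this component re-renders on each edit.
        const watched = Form.useWatch(["settings_values", "correct_answer_key"], form)
        // By contrast, form.getFieldValue(...) here would read the current
        // value once and never cause a re-render when the field changes.

        return (
            <Form form={form}>
                <Form.Item
                    name={["settings_values", "correct_answer_key"]}
                    label="Expected Answer Column"
                >
                    <Input />
                </Form.Item>
                {/* Stays in sync as the user types */}
                <Typography.Text>Current: {watched}</Typography.Text>
            </Form>
        )
    }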