Draft

45 commits
f75cb59
adding evaluators (WIP)
jp-agenta Dec 19, 2025
c2c553a
adding evaluators (WIP)
jp-agenta Dec 19, 2025
5a8dcd0
fixing evaluators
jp-agenta Dec 19, 2025
91e69e8
Merge branch 'release/v0.69.5' into chore/check-daytona-code-evaluator
jp-agenta Dec 19, 2025
a602930
testing numpy/openai/agenta
jp-agenta Dec 19, 2025
59f4797
fix typos in init
jp-agenta Dec 19, 2025
6c297c9
confirm works with localhost if public host
jp-agenta Dec 19, 2025
a717366
fix playground
jp-agenta Dec 20, 2025
b3d90f2
fix presets
jp-agenta Dec 20, 2025
5bdc802
remove blaot
jp-agenta Dec 20, 2025
5304e0f
remove bloat
jp-agenta Dec 20, 2025
00958cc
fix daytona imports
jp-agenta Dec 20, 2025
4071a3d
remove openai key from daytona
jp-agenta Dec 20, 2025
7d3ac94
WIP add runtimes
jp-agenta Dec 23, 2025
84bbdaa
Merge branch 'fix/remove-autoevals-and-rag-evaluators' into chore/che…
jp-agenta Dec 23, 2025
a4ffa8c
Merge branch 'main' into chore/check-daytona-code-evaluator
jp-agenta Dec 23, 2025
93d7bb5
WIP
jp-agenta Dec 23, 2025
d3f2a87
Clean up extra logs
jp-agenta Dec 23, 2025
d485e2f
Add/Fix presets
jp-agenta Dec 23, 2025
d3c2af3
ruff format
jp-agenta Dec 23, 2025
a924c3c
Fix editor highlighting
jp-agenta Dec 23, 2025
cc7de34
Apply suggestion from @Copilot
junaway Dec 23, 2025
6b929d0
Apply suggestion from @Copilot
junaway Dec 23, 2025
605e9af
Minor vault fixes
jp-agenta Dec 23, 2025
e6d4803
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
08f8903
more vault fix
jp-agenta Dec 23, 2025
90a5896
more vault fixes
jp-agenta Dec 23, 2025
b4a663d
more cleanups
jp-agenta Dec 23, 2025
d960f6e
Apply suggestion from @Copilot
junaway Dec 23, 2025
3498973
Apply suggestion from @Copilot
junaway Dec 23, 2025
908531f
example fixes
jp-agenta Dec 23, 2025
83c80f9
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
8301c76
update locks
jp-agenta Dec 23, 2025
7fa102a
fix tabs/spaces conversion
jp-agenta Dec 23, 2025
4b6375e
clearer error printing with daytona
jp-agenta Dec 23, 2025
bf68e6b
apply eslint
jp-agenta Dec 23, 2025
3db392d
apply eslint
jp-agenta Dec 23, 2025
59a6e6b
apply es lint
jp-agenta Dec 23, 2025
cdf1ae0
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
51856c6
fix merge issues
jp-agenta Dec 25, 2025
3966be2
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
8a8d9df
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 26, 2025
18e2e3c
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 30, 2025
9ce5afe
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 2, 2026
d9d6858
ruff format
jp-agenta Jan 2, 2026
19 changes: 12 additions & 7 deletions api/oss/src/apis/fastapi/testsets/router.py
@@ -188,7 +188,9 @@ def _serialize_value_for_csv(value: Any) -> Any:
return str(value)


def _prepare_testcases_for_csv(testcases_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
def _prepare_testcases_for_csv(
testcases_data: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Prepare testcases data for CSV export by serializing complex values."""
return [
{key: _serialize_value_for_csv(val) for key, val in row.items()}
@@ -1085,21 +1087,25 @@ async def fetch_testset_revision_to_file(
include_testcases=True,
)

if not testset_revision_response.count or not testset_revision_response.testset_revision:
if (
not testset_revision_response.count
or not testset_revision_response.testset_revision
):
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Testset revision not found. Please check the revision_id and try again.",
)

revision = testset_revision_response.testset_revision

filename = (file_name or f"revision_{testset_revision_id}") + f".{file_type.lower()}"
filename = (
file_name or f"revision_{testset_revision_id}"
) + f".{file_type.lower()}"
testcases = revision.data.testcases if revision.data else []

# Build export data using helper that properly handles Pydantic models
testcases_data = [
_build_testcase_export_row(testcase)
for testcase in testcases or []
_build_testcase_export_row(testcase) for testcase in testcases or []
]

if file_type.lower() == "json":
@@ -2040,8 +2046,7 @@ async def fetch_simple_testset_to_file(

# Build export data using helper that properly handles Pydantic models
testcases_data = [
_build_testcase_export_row(testcase)
for testcase in testcases or []
_build_testcase_export_row(testcase) for testcase in testcases or []
]

if file_type.lower() == "json":
3 changes: 3 additions & 0 deletions api/oss/src/core/workflows/dtos.py
@@ -181,6 +181,9 @@ class WorkflowServiceInterface(WorkflowServiceVersion):
class WorkflowServiceConfiguration(WorkflowServiceInterface):
script: Optional[Data] = None # str w/ validation
parameters: Optional[Data] = None # configuration values
runtime: Optional[str] = (
None # runtime environment (python, javascript, typescript), None = python
)


class WorkflowRevisionData(WorkflowServiceConfiguration):
45 changes: 44 additions & 1 deletion api/oss/src/resources/evaluators/evaluators.py
@@ -298,6 +298,41 @@
"name": "Code Evaluation",
"key": "auto_custom_code_run",
"direct_use": False,
"settings_presets": [
{
"key": "python_default",
"name": "Exact Match (Python)",
"values": {
"requires_llm_api_keys": False,
"runtime": "python",
"correct_answer_key": "correct_answer",
"code": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
},
"description": "Exact match evaluator implemented in Python.",
},
{
"key": "javascript_default",
"name": "Exact Match (JavaScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "javascript",
"correct_answer_key": "correct_answer",
"code": 'function evaluate(appParams, inputs, output, correctAnswer) {\n void appParams\n void inputs\n\n const outputStr =\n typeof output === "string" ? output : JSON.stringify(output)\n\n return outputStr === String(correctAnswer) ? 1.0 : 0.0\n}\n',
},
"description": "Exact match evaluator implemented in JavaScript.",
},
{
"key": "typescript_default",
"name": "Exact Match (TypeScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "typescript",
"correct_answer_key": "correct_answer",
"code": 'type OutputValue = string | Record<string, unknown>\n\nfunction evaluate(\n app_params: Record<string, string>,\n inputs: Record<string, string>,\n output: OutputValue,\n correct_answer: string\n): number {\n void app_params\n void inputs\n\n const outputStr =\n (typeof output === "string" ? output : JSON.stringify(output)) as string\n\n return outputStr === String(correct_answer) ? 1.0 : 0.0\n}\n',
},
Comment on lines +309 to +332

Copilot AI Jan 2, 2026

The preset code values are stored as long single-line strings with embedded newlines (\n). This makes the code difficult to read and maintain in the resource file. Consider using multiline strings or loading these presets from separate files to improve readability and maintainability of the evaluator preset code.
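
For reference, the python_default preset string above decodes (with indentation restored) to the following source — shown only to illustrate the comment; the stored value itself remains a single-line string:

from typing import Dict, Union, Any


def evaluate(
    app_params: Dict[str, str],  # deprecated; currently receives {}
    inputs: Dict[str, str],
    output: Union[str, Dict[str, Any]],
    correct_answer: str,
) -> float:
    if output == correct_answer:
        return 1.0
    return 0.0
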
"description": "Exact match evaluator implemented in TypeScript.",
},
],
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
@@ -310,10 +345,18 @@
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n",
"default": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
},
"runtime": {
"label": "Runtime",
"type": "multiple_choice",
"default": "python",
"options": ["python", "javascript", "typescript"],
"advanced": True,
"description": "Runtime environment used to execute the evaluator code.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
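
For illustration, a settings payload selecting the new runtime option might look like the sketch below. The key names are taken from the settings template above; the code value is elided:

settings = {
    "requires_llm_api_keys": False,
    "runtime": "typescript",  # one of: python, javascript, typescript
    "correct_answer_key": "correct_answer",
    "code": "...",  # evaluator source, e.g. the TypeScript preset above
}
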
13 changes: 7 additions & 6 deletions api/oss/src/routers/evaluators_router.py
@@ -98,15 +98,16 @@ async def evaluator_run(
workspace_id=str(request.state.workspace_id),
organization_id=str(request.state.organization_id),
)
credentials = f"Secret {secret_token}"

with tracing_context_manager(TracingContext.get()):
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = f"Secret {secret_token}"
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = credentials

with running_context_manager(RunningContext.get()):
running_ctx = RunningContext.get()
running_ctx.credentials = f"Secret {secret_token}"
ctx = RunningContext.get()
ctx.credentials = credentials

with tracing_context_manager(tracing_ctx):
Comment on lines +103 to +109

Copilot AI Dec 25, 2025

The context objects are retrieved and modified before being passed to context managers. This pattern could lead to issues if the contexts are modified elsewhere between get() and the context manager entry. Consider retrieving fresh contexts inside the managers or ensuring contexts are isolated.
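
A minimal sketch of the alternative the comment describes — fetching the context inside the manager's scope, much like the pre-change code did — assuming TracingContext.get() and RunningContext.get() return the context installed by the enclosing manager:

credentials = f"Secret {secret_token}"

with tracing_context_manager(TracingContext.get()):
    # Fetch fresh inside the scope; no window between get() and entry.
    TracingContext.get().credentials = credentials

    with running_context_manager(RunningContext.get()):
        RunningContext.get().credentials = credentials
        ...  # run the evaluator as below
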
with running_context_manager(ctx):
try:
result = await evaluators_service.run(
evaluator_key=evaluator_key,
58 changes: 57 additions & 1 deletion api/oss/src/services/evaluators_service.py
@@ -26,7 +26,15 @@
get_field_value_from_trace_tree,
)

from agenta.sdk.contexts.running import RunningContext
from agenta.sdk.managers.secrets import SecretsManager
from agenta.sdk.models.workflows import (
WorkflowServiceRequest,
WorkflowServiceRequestData,
)
from agenta.sdk.workflows.builtin import (
auto_custom_code_run as sdk_auto_custom_code_run,
)


log = get_module_logger(__name__)
@@ -458,6 +466,54 @@ async def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface
return {"outputs": {"score": result}}


async def sdk_custom_code_run(
input: EvaluatorInputInterface,
) -> EvaluatorOutputInterface:
inputs = input.inputs or {}
settings = input.settings or {}

code = settings.get("code")
if code is None:
raise ValueError("Missing evaluator setting: code")

correct_answer_key = settings.get("correct_answer_key")
if not correct_answer_key:
correct_answer_key = (
"ground_truth" if "ground_truth" in inputs else "correct_answer"
)

threshold = settings.get("threshold", 0.5)
runtime = settings.get("runtime")

workflow = sdk_auto_custom_code_run(
code=str(code),
correct_answer_key=str(correct_answer_key),
threshold=float(threshold),
runtime=runtime,
)

credentials = RunningContext.get().credentials

outputs = inputs.get("prediction", inputs.get("output"))
request = WorkflowServiceRequest(
data=WorkflowServiceRequestData(
inputs=inputs,
outputs=outputs,
),
credentials=credentials,
)

response = await workflow.invoke(request=request)
result = response.data.outputs if response.data else None

if isinstance(result, dict) and "score" in result:
score = result["score"]
else:
score = result

return {"outputs": {"score": score}}


async def auto_ai_critique(
inputs: Dict[str, Any],
output: Union[str, Dict[str, Any]],
@@ -2025,7 +2081,7 @@ async def auto_semantic_similarity(
"auto_regex_test": regex_test,
"field_match_test": field_match_test,
"auto_webhook_test": webhook_test,
"auto_custom_code_run": custom_code_run,
"auto_custom_code_run": sdk_custom_code_run,
"auto_ai_critique": ai_critique,
"auto_starts_with": starts_with,
"auto_ends_with": ends_with,
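
As a rough usage sketch of the remapped entry (hypothetical: the EvaluatorInputInterface constructor arguments are inferred from the attribute access above, and an active RunningContext carrying credentials is assumed):

import asyncio

# PYTHON_PRESET_CODE would hold the python_default preset source shown earlier.
payload = EvaluatorInputInterface(
    inputs={"prediction": "42", "correct_answer": "42"},
    settings={"code": PYTHON_PRESET_CODE, "runtime": "python"},
)

result = asyncio.run(sdk_custom_code_run(payload))
print(result["outputs"]["score"])  # presumably 1.0, since output equals correct_answer
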
22 changes: 22 additions & 0 deletions examples/javascript/evaluators/basic/default_preset.js
@@ -0,0 +1,22 @@
/**
* Character Count Match Test (JavaScript)
* ======================================
*
* Simple evaluator that compares character counts for output vs correct answer.
* This mirrors the Python exact_match example without NumPy.
*/

function evaluate(appParams, inputs, output, correctAnswer) {
void appParams
void inputs

try {
const outputStr =
typeof output === "string" ? output : JSON.stringify(output)
const answerStr = String(correctAnswer)

return outputStr.length === answerStr.length ? 1.0 : 0.0
} catch {
return 0.0
}
}
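
For instance, an output of "abc" scored against a correct answer of "xyz" returns 1.0 here, since only the character counts (three each) are compared, not the contents.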
Empty file.
Empty file.
56 changes: 56 additions & 0 deletions examples/python/evaluators/ag/configs_check.py
@@ -0,0 +1,56 @@
"""
Agenta Config Endpoint Test
============================
Tests Agenta config endpoint availability using requests.
"""

from typing import Dict, Union, Any
import os


def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: Union[str, Dict[str, Any]],
correct_answer: str,
) -> float:
try:
import requests
except ImportError:
return 0.0

try:
host = os.environ.get("AGENTA_HOST")
credentials = os.environ.get("AGENTA_CREDENTIALS")

if not host:
return 0.6

if not credentials:
return 0.601

headers = dict(
Authorization=credentials,
)

refs = dict(
application_ref=dict(
slug="prompt",
),
environment_ref=dict(
slug="development",
),
)

response = requests.post(
f"{host}/api/variants/configs/fetch",
headers=headers,
json=refs,
timeout=10,
)

return float(response.status_code) / 1000.0

except Exception:
return 0.602
37 changes: 37 additions & 0 deletions examples/python/evaluators/ag/health_check.py
@@ -0,0 +1,37 @@
"""
Agenta Health Endpoint Test
============================
Tests Agenta API health endpoint availability using requests.
"""

from typing import Dict, Union, Any
import os


def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: Union[str, Dict[str, Any]],
correct_answer: str,
) -> float:
try:
import requests
except ImportError:
return 0.0

try:
host = os.environ.get("AGENTA_HOST")

if not host:
return 0.6

response = requests.get(
f"{host}/api/health",
timeout=10,
)

return float(response.status_code) / 1000.0

except Exception:
return 0.602
46 changes: 46 additions & 0 deletions examples/python/evaluators/ag/secrets_check.py
@@ -0,0 +1,46 @@
"""
Agenta Secrets Endpoint Test
=============================
Tests Agenta secrets endpoint availability using requests.
"""

from typing import Dict, Union, Any
import os


def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: Union[str, Dict[str, Any]],
correct_answer: str,
) -> float:
try:
import requests
except ImportError:
return 0.0

try:
host = os.environ.get("AGENTA_HOST")
credentials = os.environ.get("AGENTA_CREDENTIALS")

if not host:
return 0.6

if not credentials:
return 0.601

headers = dict(
Authorization=credentials,
)

response = requests.get(
f"{host}/api/vault/v1/secrets/",
headers=headers,
timeout=10,
)

return float(response.status_code) / 1000.0

except Exception:
return 0.602
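
Note the convention shared by the three ag/ checks above: the HTTP status code is folded into the score (a 200 response yields 0.2), while 0.6, 0.601, and 0.602 mark a missing host, missing credentials, and a failed request respectively.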