Draft

45 commits
f75cb59
adding evaluators (WIP)
jp-agenta Dec 19, 2025
c2c553a
adding evaluators (WIP)
jp-agenta Dec 19, 2025
5a8dcd0
fixing evaluators
jp-agenta Dec 19, 2025
91e69e8
Merge branch 'release/v0.69.5' into chore/check-daytona-code-evaluator
jp-agenta Dec 19, 2025
a602930
testing numpy/openai/agenta
jp-agenta Dec 19, 2025
59f4797
fix typos in init
jp-agenta Dec 19, 2025
6c297c9
confirm works with localhost if public host
jp-agenta Dec 19, 2025
a717366
fix playground
jp-agenta Dec 20, 2025
b3d90f2
fix presets
jp-agenta Dec 20, 2025
5bdc802
remove blaot
jp-agenta Dec 20, 2025
5304e0f
remove bloat
jp-agenta Dec 20, 2025
00958cc
fix daytona imports
jp-agenta Dec 20, 2025
4071a3d
remove openai key from daytona
jp-agenta Dec 20, 2025
7d3ac94
WIP add runtimes
jp-agenta Dec 23, 2025
84bbdaa
Merge branch 'fix/remove-autoevals-and-rag-evaluators' into chore/che…
jp-agenta Dec 23, 2025
a4ffa8c
Merge branch 'main' into chore/check-daytona-code-evaluator
jp-agenta Dec 23, 2025
93d7bb5
WIP
jp-agenta Dec 23, 2025
d3f2a87
Clean up extra logs
jp-agenta Dec 23, 2025
d485e2f
Add/Fix presets
jp-agenta Dec 23, 2025
d3c2af3
ruff format
jp-agenta Dec 23, 2025
a924c3c
Fix editor highlighting
jp-agenta Dec 23, 2025
cc7de34
Apply suggestion from @Copilot
junaway Dec 23, 2025
6b929d0
Apply suggestion from @Copilot
junaway Dec 23, 2025
605e9af
Minor vault fixes
jp-agenta Dec 23, 2025
e6d4803
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
08f8903
more vault fix
jp-agenta Dec 23, 2025
90a5896
more vault fixes
jp-agenta Dec 23, 2025
b4a663d
more cleanups
jp-agenta Dec 23, 2025
d960f6e
Apply suggestion from @Copilot
junaway Dec 23, 2025
3498973
Apply suggestion from @Copilot
junaway Dec 23, 2025
908531f
example fixes
jp-agenta Dec 23, 2025
83c80f9
Merge branch 'chore/check-daytona-code-evaluator' of github.com:Agent…
jp-agenta Dec 23, 2025
8301c76
update locks
jp-agenta Dec 23, 2025
7fa102a
fix tabs/spaces conversion
jp-agenta Dec 23, 2025
4b6375e
clearer error printing with daytona
jp-agenta Dec 23, 2025
bf68e6b
apply eslint
jp-agenta Dec 23, 2025
3db392d
apply eslint
jp-agenta Dec 23, 2025
59a6e6b
apply es lint
jp-agenta Dec 23, 2025
cdf1ae0
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
51856c6
fix merge issues
jp-agenta Dec 25, 2025
3966be2
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 25, 2025
8a8d9df
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 26, 2025
18e2e3c
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Dec 30, 2025
9ce5afe
Merge branch 'frontend-feat/new-testsets-integration' into chore/chec…
jp-agenta Jan 2, 2026
d9d6858
ruff format
jp-agenta Jan 2, 2026
19 changes: 12 additions & 7 deletions api/oss/src/apis/fastapi/testsets/router.py
@@ -188,7 +188,9 @@ def _serialize_value_for_csv(value: Any) -> Any:
return str(value)


def _prepare_testcases_for_csv(testcases_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
def _prepare_testcases_for_csv(
testcases_data: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Prepare testcases data for CSV export by serializing complex values."""
return [
{key: _serialize_value_for_csv(val) for key, val in row.items()}
@@ -1085,21 +1087,25 @@ async def fetch_testset_revision_to_file(
include_testcases=True,
)

if not testset_revision_response.count or not testset_revision_response.testset_revision:
if (
not testset_revision_response.count
or not testset_revision_response.testset_revision
):
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Testset revision not found. Please check the revision_id and try again.",
)

revision = testset_revision_response.testset_revision

filename = (file_name or f"revision_{testset_revision_id}") + f".{file_type.lower()}"
filename = (
file_name or f"revision_{testset_revision_id}"
) + f".{file_type.lower()}"
testcases = revision.data.testcases if revision.data else []

# Build export data using helper that properly handles Pydantic models
testcases_data = [
_build_testcase_export_row(testcase)
for testcase in testcases or []
_build_testcase_export_row(testcase) for testcase in testcases or []
]

if file_type.lower() == "json":
@@ -2040,8 +2046,7 @@ async def fetch_simple_testset_to_file(

# Build export data using helper that properly handles Pydantic models
testcases_data = [
_build_testcase_export_row(testcase)
for testcase in testcases or []
_build_testcase_export_row(testcase) for testcase in testcases or []
]

if file_type.lower() == "json":
3 changes: 3 additions & 0 deletions api/oss/src/core/workflows/dtos.py
@@ -181,6 +181,9 @@ class WorkflowServiceInterface(WorkflowServiceVersion):
class WorkflowServiceConfiguration(WorkflowServiceInterface):
script: Optional[Data] = None # str w/ validation
parameters: Optional[Data] = None # configuration values
runtime: Optional[str] = (
None # runtime environment (python, javascript, typescript), None = python
)


class WorkflowRevisionData(WorkflowServiceConfiguration):
45 changes: 44 additions & 1 deletion api/oss/src/resources/evaluators/evaluators.py
@@ -298,6 +298,41 @@
"name": "Code Evaluation",
"key": "auto_custom_code_run",
"direct_use": False,
"settings_presets": [
{
"key": "python_default",
"name": "Exact Match (Python)",
"values": {
"requires_llm_api_keys": False,
"runtime": "python",
"correct_answer_key": "correct_answer",
"code": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
},
"description": "Exact match evaluator implemented in Python.",
},
{
"key": "javascript_default",
"name": "Exact Match (JavaScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "javascript",
"correct_answer_key": "correct_answer",
"code": 'function evaluate(appParams, inputs, output, correctAnswer) {\n void appParams\n void inputs\n\n const outputStr =\n typeof output === "string" ? output : JSON.stringify(output)\n\n return outputStr === String(correctAnswer) ? 1.0 : 0.0\n}\n',
},
"description": "Exact match evaluator implemented in JavaScript.",
},
{
"key": "typescript_default",
"name": "Exact Match (TypeScript)",
"values": {
"requires_llm_api_keys": False,
"runtime": "typescript",
"correct_answer_key": "correct_answer",
"code": 'type OutputValue = string | Record<string, unknown>\n\nfunction evaluate(\n app_params: Record<string, string>,\n inputs: Record<string, string>,\n output: OutputValue,\n correct_answer: string\n): number {\n void app_params\n void inputs\n\n const outputStr =\n (typeof output === "string" ? output : JSON.stringify(output)) as string\n\n return outputStr === String(correct_answer) ? 1.0 : 0.0\n}\n',
},
Comment on lines +309 to +332

Copilot AI Jan 2, 2026

The preset code values are stored as long single-line strings with embedded newlines (\n). This makes the code difficult to read and maintain in the resource file. Consider using multiline strings or loading these presets from separate files to improve readability and maintainability of the evaluator preset code.
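
For reference, the python_default preset string above decodes (with indentation restored) to the following source — shown only to illustrate the comment; the stored value itself remains a single-line string:

from typing import Dict, Union, Any


def evaluate(
    app_params: Dict[str, str],  # deprecated; currently receives {}
    inputs: Dict[str, str],
    output: Union[str, Dict[str, Any]],
    correct_answer: str,
) -> float:
    if output == correct_answer:
        return 1.0
    return 0.0
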
"description": "Exact match evaluator implemented in TypeScript.",
},
],
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
@@ -310,10 +345,18 @@
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n",
"default": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
},
"runtime": {
"label": "Runtime",
"type": "multiple_choice",
"default": "python",
"options": ["python", "javascript", "typescript"],
"advanced": True,
"description": "Runtime environment used to execute the evaluator code.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
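
For illustration, a settings payload selecting the new runtime option might look like the sketch below. The key names are taken from the settings template above; the code value is elided:

settings = {
    "requires_llm_api_keys": False,
    "runtime": "typescript",  # one of: python, javascript, typescript
    "correct_answer_key": "correct_answer",
    "code": "...",  # evaluator source, e.g. the TypeScript preset above
}
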
13 changes: 7 additions & 6 deletions api/oss/src/routers/evaluators_router.py
@@ -98,15 +98,16 @@ async def evaluator_run(
workspace_id=str(request.state.workspace_id),
organization_id=str(request.state.organization_id),
)
credentials = f"Secret {secret_token}"

with tracing_context_manager(TracingContext.get()):
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = f"Secret {secret_token}"
tracing_ctx = TracingContext.get()
tracing_ctx.credentials = credentials

with running_context_manager(RunningContext.get()):
running_ctx = RunningContext.get()
running_ctx.credentials = f"Secret {secret_token}"
ctx = RunningContext.get()
ctx.credentials = credentials

with tracing_context_manager(tracing_ctx):
Comment on lines +103 to +109

Copilot AI Dec 25, 2025

The context objects are retrieved and modified before being passed to context managers. This pattern could lead to issues if the contexts are modified elsewhere between get() and the context manager entry. Consider retrieving fresh contexts inside the managers or ensuring contexts are isolated.
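
A minimal sketch of the alternative the comment describes — fetching the context inside the manager's scope, much like the pre-change code did — assuming TracingContext.get() and RunningContext.get() return the context installed by the enclosing manager:

credentials = f"Secret {secret_token}"

with tracing_context_manager(TracingContext.get()):
    # Fetch fresh inside the scope; no window between get() and entry.
    TracingContext.get().credentials = credentials

    with running_context_manager(RunningContext.get()):
        RunningContext.get().credentials = credentials
        ...  # run the evaluator as below
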
with running_context_manager(ctx):
try:
result = await evaluators_service.run(
evaluator_key=evaluator_key,
58 changes: 57 additions & 1 deletion api/oss/src/services/evaluators_service.py
@@ -26,7 +26,15 @@
get_field_value_from_trace_tree,
)

from agenta.sdk.contexts.running import RunningContext
from agenta.sdk.managers.secrets import SecretsManager
from agenta.sdk.models.workflows import (
WorkflowServiceRequest,
WorkflowServiceRequestData,
)
from agenta.sdk.workflows.builtin import (
auto_custom_code_run as sdk_auto_custom_code_run,
)


log = get_module_logger(__name__)
@@ -458,6 +466,54 @@ async def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface
return {"outputs": {"score": result}}


async def sdk_custom_code_run(
input: EvaluatorInputInterface,
) -> EvaluatorOutputInterface:
inputs = input.inputs or {}
settings = input.settings or {}

code = settings.get("code")
if code is None:
raise ValueError("Missing evaluator setting: code")

correct_answer_key = settings.get("correct_answer_key")
if not correct_answer_key:
correct_answer_key = (
"ground_truth" if "ground_truth" in inputs else "correct_answer"
)

threshold = settings.get("threshold", 0.5)
runtime = settings.get("runtime")

workflow = sdk_auto_custom_code_run(
code=str(code),
correct_answer_key=str(correct_answer_key),
threshold=float(threshold),
runtime=runtime,
)

credentials = RunningContext.get().credentials

outputs = inputs.get("prediction", inputs.get("output"))
request = WorkflowServiceRequest(
data=WorkflowServiceRequestData(
inputs=inputs,
outputs=outputs,
),
credentials=credentials,
)

response = await workflow.invoke(request=request)
result = response.data.outputs if response.data else None

if isinstance(result, dict) and "score" in result:
score = result["score"]
else:
score = result

return {"outputs": {"score": score}}


async def auto_ai_critique(
inputs: Dict[str, Any],
output: Union[str, Dict[str, Any]],
@@ -2025,7 +2081,7 @@ async def auto_semantic_similarity(
"auto_regex_test": regex_test,
"field_match_test": field_match_test,
"auto_webhook_test": webhook_test,
"auto_custom_code_run": custom_code_run,
"auto_custom_code_run": sdk_custom_code_run,
"auto_ai_critique": ai_critique,
"auto_starts_with": starts_with,
"auto_ends_with": ends_with,
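
As a rough usage sketch of the remapped entry (hypothetical: the EvaluatorInputInterface constructor arguments are inferred from the attribute access above, and an active RunningContext carrying credentials is assumed):

import asyncio

# PYTHON_PRESET_CODE would hold the python_default preset source shown earlier.
payload = EvaluatorInputInterface(
    inputs={"prediction": "42", "correct_answer": "42"},
    settings={"code": PYTHON_PRESET_CODE, "runtime": "python"},
)

result = asyncio.run(sdk_custom_code_run(payload))
print(result["outputs"]["score"])  # presumably 1.0, since output equals correct_answer
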
22 changes: 22 additions & 0 deletions examples/javascript/evaluators/basic/default_preset.js
@@ -0,0 +1,22 @@
/**
* Character Count Match Test (JavaScript)
* ======================================
*
* Simple evaluator that compares character counts for output vs correct answer.
* This mirrors the Python exact_match example without NumPy.
*/

function evaluate(appParams, inputs, output, correctAnswer) {
void appParams
void inputs

try {
const outputStr =
typeof output === "string" ? output : JSON.stringify(output)
const answerStr = String(correctAnswer)

return outputStr.length === answerStr.length ? 1.0 : 0.0
} catch {
return 0.0
}
}
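
For instance, an output of "abc" scored against a correct answer of "xyz" returns 1.0 here, since only the character counts (three each) are compared, not the contents.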
Empty file.
Empty file.
56 changes: 56 additions & 0 deletions examples/python/evaluators/ag/configs_check.py
@@ -0,0 +1,56 @@
"""
Agenta Config Endpoint Test
============================
Tests Agenta config endpoint availability using requests.
"""

from typing import Dict, Union, Any
import os


def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: Union[str, Dict[str, Any]],
correct_answer: str,
) -> float:
try:
import requests
except ImportError:
return 0.0

try:
host = os.environ.get("AGENTA_HOST")
credentials = os.environ.get("AGENTA_CREDENTIALS")

if not host:
return 0.6

if not credentials:
return 0.601

headers = dict(
Authorization=credentials,
)

refs = dict(
application_ref=dict(
slug="prompt",
),
environment_ref=dict(
slug="development",
),
)

response = requests.post(
f"{host}/api/variants/configs/fetch",
headers=headers,
json=refs,
timeout=10,
)

return float(response.status_code) / 1000.0

except Exception:
return 0.602
37 changes: 37 additions & 0 deletions examples/python/evaluators/ag/health_check.py
@@ -0,0 +1,37 @@
"""
Agenta Health Endpoint Test
============================
Tests Agenta API health endpoint availability using requests.
"""

from typing import Dict, Union, Any
import os


def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: Union[str, Dict[str, Any]],
correct_answer: str,
) -> float:
try:
import requests
except ImportError:
return 0.0

try:
host = os.environ.get("AGENTA_HOST")

if not host:
return 0.6

response = requests.get(
f"{host}/api/health",
timeout=10,
)

return float(response.status_code) / 1000.0

except Exception:
return 0.602
46 changes: 46 additions & 0 deletions examples/python/evaluators/ag/secrets_check.py
@@ -0,0 +1,46 @@
"""
Agenta Secrets Endpoint Test
=============================
Tests Agenta secrets endpoint availability using requests.
"""

from typing import Dict, Union, Any
import os


def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: Union[str, Dict[str, Any]],
correct_answer: str,
) -> float:
try:
import requests
except ImportError:
return 0.0

try:
host = os.environ.get("AGENTA_HOST")
credentials = os.environ.get("AGENTA_CREDENTIALS")

if not host:
return 0.6

if not credentials:
return 0.601

headers = dict(
Authorization=credentials,
)

response = requests.get(
f"{host}/api/vault/v1/secrets/",
headers=headers,
timeout=10,
)

return float(response.status_code) / 1000.0

except Exception:
return 0.602
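
Note the convention shared by the three ag/ checks above: the HTTP status code is folded into the score (a 200 response yields 0.2), while 0.6, 0.601, and 0.602 mark a missing host, missing credentials, and a failed request respectively.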