-
Notifications
You must be signed in to change notification settings - Fork 432
[feat] Add DaytonaRunner for code evaluators
#3258
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: frontend-feat/new-testsets-integration
Are you sure you want to change the base?
Changes from all commits
f75cb59
c2c553a
5a8dcd0
91e69e8
a602930
59f4797
6c297c9
a717366
b3d90f2
5bdc802
5304e0f
00958cc
4071a3d
7d3ac94
84bbdaa
a4ffa8c
93d7bb5
d3f2a87
d485e2f
d3c2af3
a924c3c
cc7de34
6b929d0
605e9af
e6d4803
08f8903
90a5896
b4a663d
d960f6e
3498973
908531f
83c80f9
8301c76
7fa102a
4b6375e
bf68e6b
3db392d
59a6e6b
cdf1ae0
51856c6
3966be2
8a8d9df
18e2e3c
9ce5afe
d9d6858
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -298,6 +298,41 @@ | |
| "name": "Code Evaluation", | ||
| "key": "auto_custom_code_run", | ||
| "direct_use": False, | ||
| "settings_presets": [ | ||
| { | ||
| "key": "python_default", | ||
| "name": "Exact Match (Python)", | ||
| "values": { | ||
| "requires_llm_api_keys": False, | ||
| "runtime": "python", | ||
| "correct_answer_key": "correct_answer", | ||
| "code": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n", | ||
| }, | ||
| "description": "Exact match evaluator implemented in Python.", | ||
| }, | ||
| { | ||
| "key": "javascript_default", | ||
| "name": "Exact Match (JavaScript)", | ||
| "values": { | ||
| "requires_llm_api_keys": False, | ||
| "runtime": "javascript", | ||
| "correct_answer_key": "correct_answer", | ||
| "code": 'function evaluate(appParams, inputs, output, correctAnswer) {\n void appParams\n void inputs\n\n const outputStr =\n typeof output === "string" ? output : JSON.stringify(output)\n\n return outputStr === String(correctAnswer) ? 1.0 : 0.0\n}\n', | ||
| }, | ||
| "description": "Exact match evaluator implemented in JavaScript.", | ||
| }, | ||
| { | ||
| "key": "typescript_default", | ||
| "name": "Exact Match (TypeScript)", | ||
| "values": { | ||
| "requires_llm_api_keys": False, | ||
| "runtime": "typescript", | ||
| "correct_answer_key": "correct_answer", | ||
| "code": 'type OutputValue = string | Record<string, unknown>\n\nfunction evaluate(\n app_params: Record<string, string>,\n inputs: Record<string, string>,\n output: OutputValue,\n correct_answer: string\n): number {\n void app_params\n void inputs\n\n const outputStr =\n (typeof output === "string" ? output : JSON.stringify(output)) as string\n\n return outputStr === String(correct_answer) ? 1.0 : 0.0\n}\n', | ||
junaway marked this conversation as resolved.
Show resolved
Hide resolved
junaway marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| }, | ||
|
Comment on lines
+309
to
+332
|
||
| "description": "Exact match evaluator implemented in TypeScript.", | ||
| }, | ||
| ], | ||
| "settings_template": { | ||
| "requires_llm_api_keys": { | ||
| "label": "Requires LLM API Key(s)", | ||
|
|
@@ -310,10 +345,18 @@ | |
| "code": { | ||
| "label": "Evaluation Code", | ||
| "type": "code", | ||
| "default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n", | ||
| "default": "from typing import Dict, Union, Any\n\n\ndef evaluate(\n app_params: Dict[str, str], # deprecated; currently receives {}\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]],\n correct_answer: str,\n) -> float:\n if output == correct_answer:\n return 1.0\n return 0.0\n", | ||
| "description": "Code for evaluating submissions", | ||
junaway marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| "required": True, | ||
| }, | ||
| "runtime": { | ||
| "label": "Runtime", | ||
| "type": "multiple_choice", | ||
| "default": "python", | ||
| "options": ["python", "javascript", "typescript"], | ||
| "advanced": True, | ||
| "description": "Runtime environment used to execute the evaluator code.", | ||
| }, | ||
| "correct_answer_key": { | ||
| "label": "Expected Answer Column", | ||
| "default": "correct_answer", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -98,15 +98,16 @@ async def evaluator_run( | |
| workspace_id=str(request.state.workspace_id), | ||
| organization_id=str(request.state.organization_id), | ||
| ) | ||
| credentials = f"Secret {secret_token}" | ||
|
|
||
| with tracing_context_manager(TracingContext.get()): | ||
| tracing_ctx = TracingContext.get() | ||
| tracing_ctx.credentials = f"Secret {secret_token}" | ||
| tracing_ctx = TracingContext.get() | ||
| tracing_ctx.credentials = credentials | ||
|
|
||
| with running_context_manager(RunningContext.get()): | ||
| running_ctx = RunningContext.get() | ||
| running_ctx.credentials = f"Secret {secret_token}" | ||
| ctx = RunningContext.get() | ||
| ctx.credentials = credentials | ||
|
|
||
| with tracing_context_manager(tracing_ctx): | ||
|
Comment on lines
+103
to
+109
|
||
| with running_context_manager(ctx): | ||
| try: | ||
| result = await evaluators_service.run( | ||
| evaluator_key=evaluator_key, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| /** | ||
| * Character Count Match Test (JavaScript) | ||
| * ====================================== | ||
| * | ||
| * Simple evaluator that compares character counts for output vs correct answer. | ||
| * This mirrors the Python exact_match example without NumPy. | ||
| */ | ||
|
|
||
/**
 * Score 1.0 when the output and the correct answer have the same number of
 * characters, 0.0 otherwise.
 *
 * Non-string outputs are serialized with JSON.stringify before counting; the
 * correct answer is coerced with String(). Characters are counted as Unicode
 * code points (not UTF-16 code units), so an emoji counts as one character,
 * matching what Python's len() reports for the same text.
 *
 * @param appParams unused; accepted for evaluator interface compatibility
 * @param inputs unused; accepted for evaluator interface compatibility
 * @param output application output (string or JSON-serializable value)
 * @param correctAnswer expected answer
 * @returns 1.0 on equal character counts, 0.0 on mismatch or on any error
 */
function evaluate(appParams, inputs, output, correctAnswer) {
    void appParams
    void inputs

    try {
        const outputStr =
            typeof output === "string" ? output : JSON.stringify(output)
        const answerStr = String(correctAnswer)

        // Spreading a string iterates by code point. JSON.stringify returns
        // undefined for values it cannot serialize; spreading that throws and
        // is reported as 0.0 by the catch below.
        const outputCount = [...outputStr].length
        const answerCount = [...answerStr].length

        return outputCount === answerCount ? 1.0 : 0.0
    } catch {
        return 0.0
    }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| """ | ||
| Agenta Config Endpoint Test | ||
| ============================ | ||
| Tests Agenta config endpoint availability using requests. | ||
| """ | ||
|
|
||
| from typing import Dict, Union, Any | ||
| import os | ||
|
|
||
|
|
||
def evaluate(
    app_params: Dict[str, str],
    inputs: Dict[str, str],
    output: Union[str, Dict[str, Any]],
    correct_answer: str,
) -> float:
    """Probe the Agenta config-fetch endpoint and encode the outcome as a score.

    The evaluator arguments are accepted for interface compatibility only;
    none of them is read.

    Returns:
        0.0 if ``requests`` cannot be imported; 0.6 if ``AGENTA_HOST`` is
        unset or empty; 0.601 if ``AGENTA_CREDENTIALS`` is unset or empty;
        0.602 if anything in the probe raises; otherwise the HTTP status
        code divided by 1000 (for example 200 -> 0.2).
    """
    try:
        import requests
    except ImportError:
        return 0.0

    try:
        host = os.environ.get("AGENTA_HOST")
        credentials = os.environ.get("AGENTA_CREDENTIALS")

        if not host:
            return 0.6

        if not credentials:
            return 0.601

        headers = {"Authorization": credentials}

        refs = {
            "application_ref": {"slug": "prompt"},
            "environment_ref": {"slug": "development"},
        }

        # A host configured with a trailing slash would otherwise produce
        # "//api/..." in the request URL.
        base_url = host.rstrip("/")

        response = requests.post(
            f"{base_url}/api/variants/configs/fetch",
            headers=headers,
            json=refs,
            timeout=10,
        )

        return float(response.status_code) / 1000.0

    except Exception:
        # Deliberately broad: every failure is reported as a sentinel score
        # instead of propagating out of the evaluator.
        return 0.602
junaway marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| """ | ||
| Agenta Health Endpoint Test | ||
| ============================ | ||
| Tests Agenta API health endpoint availability using requests. | ||
| """ | ||
|
|
||
| from typing import Dict, Union, Any | ||
| import os | ||
|
|
||
|
|
||
def evaluate(
    app_params: Dict[str, str],
    inputs: Dict[str, str],
    output: Union[str, Dict[str, Any]],
    correct_answer: str,
) -> float:
    """Probe the Agenta health endpoint and encode the outcome as a score.

    The evaluator arguments are accepted for interface compatibility only;
    none of them is read.

    Returns:
        0.0 if ``requests`` cannot be imported; 0.6 if ``AGENTA_HOST`` is
        unset or empty; 0.602 if anything in the probe raises; otherwise the
        HTTP status code divided by 1000 (for example 200 -> 0.2).
    """
    try:
        import requests
    except ImportError:
        return 0.0

    try:
        host = os.environ.get("AGENTA_HOST")

        if not host:
            return 0.6

        # A host configured with a trailing slash would otherwise produce
        # "//api/health" in the request URL.
        base_url = host.rstrip("/")

        response = requests.get(
            f"{base_url}/api/health",
            timeout=10,
        )

        return float(response.status_code) / 1000.0

    except Exception:
        # Deliberately broad: every failure is reported as a sentinel score
        # instead of propagating out of the evaluator.
        return 0.602
junaway marked this conversation as resolved.
Show resolved
Hide resolved
junaway marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| """ | ||
| Agenta Secrets Endpoint Test | ||
| ============================= | ||
| Tests Agenta secrets endpoint availability using requests. | ||
| """ | ||
|
|
||
| from typing import Dict, Union, Any | ||
| import os | ||
|
|
||
|
|
||
def evaluate(
    app_params: Dict[str, str],
    inputs: Dict[str, str],
    output: Union[str, Dict[str, Any]],
    correct_answer: str,
) -> float:
    """Probe the Agenta vault secrets endpoint and encode the outcome as a score.

    The evaluator arguments are accepted for interface compatibility only;
    none of them is read.

    Returns:
        0.0 if ``requests`` cannot be imported; 0.6 if ``AGENTA_HOST`` is
        unset or empty; 0.601 if ``AGENTA_CREDENTIALS`` is unset or empty;
        0.602 if anything in the probe raises; otherwise the HTTP status
        code divided by 1000 (for example 200 -> 0.2).
    """
    try:
        import requests
    except ImportError:
        return 0.0

    try:
        host = os.environ.get("AGENTA_HOST")
        credentials = os.environ.get("AGENTA_CREDENTIALS")

        if not host:
            return 0.6

        if not credentials:
            return 0.601

        headers = {"Authorization": credentials}

        # A host configured with a trailing slash would otherwise produce
        # "//api/..." in the request URL.
        base_url = host.rstrip("/")

        response = requests.get(
            f"{base_url}/api/vault/v1/secrets/",
            headers=headers,
            timeout=10,
        )

        return float(response.status_code) / 1000.0

    except Exception:
        # Deliberately broad: every failure is reported as a sentinel score
        # instead of propagating out of the evaluator.
        return 0.602
junaway marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Uh oh!
There was an error while loading. Please reload this page.