diff --git a/.gitignore b/.gitignore
index 83e4e75..4ef3b31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,11 @@ __pycache__/
 external/*
 **/uv.lock
 *.egg-info/
-**/.venv/
\ No newline at end of file
+**/.venv/
+.env
+runs/
+runs_test/
+notebooks/01_smoke_runner_with_output.ipynb
+notebooks/01_m1_minimal_api_with_output.ipynb
+/.tmp_runs_run
+/.tmp_runs_validate
diff --git a/README.md b/README.md
index 3423365..bf586a2 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,70 @@
 Currently, we are adding problems/domains one folder at a time.
 The instructions to run each task are located inside the task folder.
 
+## Quick Start (Runner/CLI)
+
+```bash
+# M1 review checklist (recommended order)
+# 1) List tasks (LLM4AD + example stubs)
+trace-bench list-tasks --root LLM4AD/benchmark_tasks
+
+# 2) Validate a config
+trace-bench validate --config configs/smoke.yaml
+
+# 3) Run Stub smoke (deterministic, no keys)
+trace-bench run --config configs/smoke.yaml --runs-dir runs
+
+# 4) Run Real smoke (requires OPENAI_API_KEY)
+trace-bench run --config configs/smoke_real.yaml --runs-dir runs
+
+# 5) Run tests (disable external plugin autoload)
+PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q
+
+# 6) Launch UI (stub)
+trace-bench ui --runs-dir runs
+```
+
+Expected run artifacts:
+- `runs/<run_id>/config.snapshot.yaml`
+- `runs/<run_id>/env.json`
+- `runs/<run_id>/results.csv`
+- `runs/<run_id>/events.jsonl`
+- `runs/<run_id>/summary.json`
+- `runs/<run_id>/tb/`
+
+## M1 Dependencies (Required for Full Pass)
+
+System:
+- Graphviz (system package)
+
+Python:
+- `graphviz`, `pyyaml`, `pytest`, `numpy`, `matplotlib`, `litellm==1.75.0`
+
+OpenTrace examples strict smoke (for 100% pass):
+- `datasets`, `textgrad`, `dspy`, `autogen`, `python-dotenv`
+
+## OpenTrace Examples Smoke (100% Pass Mode)
+
+To enforce 100% example smoke in CI, run:
+```bash
+TRACE_BENCH_STRICT_EXAMPLES=1 PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q
+```
+Without strict mode, the smoke test skips only when optional deps are missing.
+
+## VeriBench Status (In Scope, Pending Input)
+
+VeriBench is in scope but requires the Trace team to provide the task entrypoint/task list.
+CLI flags are ready (`--bench veribench`); when the entrypoint is unavailable, tasks are skipped with a structured reason rather than raising.
+
 ## Problem Sets
 
 ### General Problem Sets
@@ -27,9 +91,9 @@
 Current implementation of graph is a single node.
 
 **Supported Algorithms:** PrioritySearch, GEPA-Base, GEPA-UCB, GEPA-Beam
 
-📖 **[See detailed usage guide →](LM4AD/readme.md)**
+**See detailed usage guide:** `LM4AD/readme.md`
 
 ## Agent Architecture - ReAct agent
 
-All the libraries from other repos are stored and managed in the `external` folder -- this folder will be created if one of the `install.sh` script is run inside the task folder.
+All the libraries from other repos are stored and managed in the `external` folder -- this folder will be created when one of the `install.sh` scripts is run inside the task folder.
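The expected run artifacts listed in the README above can be spot-checked with a few lines of Python after `trace-bench run` completes. This is a minimal sketch, assuming the `runs/<run_id>/` layout from the README; the `latest_run` and `check_artifacts` helpers are illustrative, not part of the `trace-bench` CLI, and the `meta/` fallback mirrors the newer `meta/` + `jobs/` layout exercised in the notebook below.

```python
# Sketch: verify the canonical artifacts of the most recent run folder.
# The helper names below are illustrative; only the artifact list comes from the README.
from pathlib import Path

EXPECTED = ["config.snapshot.yaml", "env.json", "results.csv",
            "events.jsonl", "summary.json", "tb"]


def latest_run(runs_dir: str = "runs") -> Path:
    """Return the most recently modified run folder under runs_dir."""
    run_dirs = [p for p in Path(runs_dir).iterdir() if p.is_dir()]
    if not run_dirs:
        raise FileNotFoundError(f"no run folders under {runs_dir}")
    return max(run_dirs, key=lambda p: p.stat().st_mtime)


def check_artifacts(run_dir: Path) -> list[str]:
    """List expected artifacts that are missing, accepting flat or meta/ layouts."""
    missing = []
    for name in EXPECTED:
        if not (run_dir / name).exists() and not (run_dir / "meta" / name).exists():
            missing.append(name)
    return missing


if __name__ == "__main__":
    run_dir = latest_run()
    missing = check_artifacts(run_dir)
    print(run_dir, "->", "all artifacts present" if not missing else f"missing: {missing}")
```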
diff --git a/configs/m1_matrix_smoke.yaml b/configs/m1_matrix_smoke.yaml new file mode 100644 index 0000000..3ba1b6e --- /dev/null +++ b/configs/m1_matrix_smoke.yaml @@ -0,0 +1,24 @@ +runs_dir: runs +mode: stub +seeds: [123] +max_workers: 1 +fail_fast: false + +tasks: + - id: internal:numeric_param + - id: llm4ad:circle_packing + eval_kwargs: + timeout_seconds: 10 + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 + + - id: GEPA-Base + params_variants: + - gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 diff --git a/configs/m1_validation.yaml b/configs/m1_validation.yaml new file mode 100644 index 0000000..fdbe511 --- /dev/null +++ b/configs/m1_validation.yaml @@ -0,0 +1,55 @@ +runs_dir: runs +mode: stub +seeds: [123] +max_workers: 1 +fail_fast: false + +tasks: + - id: internal:code_param + - id: internal:numeric_param + - id: internal:multi_param + - id: internal:non_trainable + - id: trace_examples:greeting_stub + - id: llm4ad:circle_packing + eval_kwargs: + timeout_seconds: 10 + - id: veribench:smoke_placeholder + +trainers: + - id: PrioritySearch + params_variants: + - threads: 2 + ps_steps: 1 + ps_batches: 1 + ps_candidates: 2 + ps_proposals: 2 + ps_mem_update: 1 + + - id: GEPA-Base + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + optimizer: OPROv2 + optimizer_kwargs: {} + + - id: GEPA-UCB + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + + - id: GEPA-Beam + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + +eval_kwargs: + timeout_seconds: 10 diff --git a/configs/smoke.yaml b/configs/smoke.yaml new file mode 100644 index 0000000..8455c9f --- /dev/null +++ b/configs/smoke.yaml @@ -0,0 +1,12 @@ +runs_dir: runs +mode: stub +seeds: [123] + +tasks: + - id: internal:numeric_param + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 diff --git a/configs/smoke_real.yaml b/configs/smoke_real.yaml new file mode 100644 index 0000000..2ebb27d --- /dev/null +++ b/configs/smoke_real.yaml @@ -0,0 +1,12 @@ +runs_dir: runs +mode: real +seeds: [123] + +tasks: + - id: trace_examples:greeting_stub + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb new file mode 100644 index 0000000..410c05e --- /dev/null +++ b/notebooks/01_m1_minimal_api.ipynb @@ -0,0 +1,1634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "euYNX4m-m0Ty" + }, + "source": [ + "# Trace-Bench M1 β€” Minimal API Validation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", + "\n", + "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", + "\n", + "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." 
+ ], + "id": "euYNX4m-m0Ty" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5DVjcAAm0UH" + }, + "source": [ + "## Expected Outputs\n", + "\n", + "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", + "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", + "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", + "- Internal non-trainable job shows `status=failed` with reason.\n", + "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", + "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." + ], + "id": "u5DVjcAAm0UH" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "8D3DGyVXm0UJ", + "outputId": "2b621443-f1f0-45c2-bbec-d8f0803ea933", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "Runs dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", + "API key found - running in REAL mode (model: openrouter/openai/gpt-4o-mini)\n", + "\n", + "Mode: real\n" + ] + } + ], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive_root = Path(\"/content/drive/MyDrive\")\n", + " root = drive_root if drive_root.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)\n", + "\n", + "# --- Auto-detect API key (real mode by default) ---\n", + "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", + "if not API_KEY:\n", + " try:\n", + " from google.colab import userdata\n", + " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", + " except Exception:\n", + " pass\n", + "\n", + "MODEL = os.environ.get(\"OPENROUTER_MODEL\", \"openrouter/openai/gpt-4o-mini\")\n", + "\n", + "if API_KEY:\n", + " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", + " # Compatibility for OpenAI-style clients used internally by optimizers.\n", + " os.environ[\"OPENAI_API_KEY\"] = API_KEY\n", + " os.environ[\"OPENAI_API_BASE\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", + " MODE = \"real\"\n", + " print(f\"API key found - running in REAL mode (model: {MODEL})\")\n", + "else:\n", + " MODE = \"stub\"\n", + " print(\"WARNING: No OPENROUTER_API_KEY found. 
Falling back to STUB mode.\")\n", + " print(\" All outputs below are labeled STUB - not real LLM results.\")\n", + "\n", + "os.environ[\"TB_MODE\"] = MODE\n", + "print(f\"\\nMode: {MODE}\")" + ], + "id": "8D3DGyVXm0UJ" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "swOi3Bhtm0UQ", + "outputId": "318e618c-53c0-407e-d757-0ade4d0b5ff1", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Trace-Bench'...\n", + "remote: Enumerating objects: 317, done.\u001b[K\n", + "remote: Counting objects: 100% (317/317), done.\u001b[K\n", + "remote: Compressing objects: 100% (219/219), done.\u001b[K\n", + "remote: Total 317 (delta 42), reused 282 (delta 41), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (317/317), 3.85 MiB | 15.92 MiB/s, done.\n", + "Resolving deltas: 100% (42/42), done.\n", + "Cloning into 'OpenTrace'...\n", + "remote: Enumerating objects: 228, done.\u001b[K\n", + "remote: Counting objects: 100% (228/228), done.\u001b[K\n", + "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", + "remote: Total 228 (delta 17), reused 115 (delta 13), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (228/228), 4.73 MiB | 9.13 MiB/s, done.\n", + "Resolving deltas: 100% (17/17), done.\n", + "/content/Trace-Bench\n", + "Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease\n", + "Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", + "Get:3 https://cli.github.com/packages stable InRelease [3,917 B]\n", + "Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", + "Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:8 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", + "Get:9 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,040 kB]\n", + "Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", + "Get:11 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", + "Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,678 kB]\n", + "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", + "Get:15 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,895 kB]\n", + "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", + "Get:17 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", + "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,728 kB]\n", + "Get:19 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", + "Get:20 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", + "Get:21 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", + "Get:22 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", + "Get:23 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", + "Fetched 37.1 MB in 4s (9,313 kB/s)\n", + "Reading package lists... 
Done\n", + "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 57 not upgraded.\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", + "Collecting pip\n", + " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m19.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 24.1.2\n", + " Uninstalling pip-24.1.2:\n", + " Successfully uninstalled pip-24.1.2\n", + "Successfully installed pip-26.0.1\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", + "Collecting litellm==1.75.0\n", + " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", + "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.17.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in 
/usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", + "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from 
httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: litellm\n", + "Successfully installed litellm-1.75.0\n" + ] + } + ], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch m1/deliverable https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental 
https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0\n" + ], + "id": "swOi3Bhtm0UQ" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "a__iRJTHm0UR", + "outputId": "3f85acb0-c2be-4ae4-fcf7-3aea796bf95b", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== List trainers ===\n", + "AggregatedUpdate\tavailable\n", + "BasicSearchAlgorithm\tavailable\n", + "BeamSearch\tavailable\n", + "BeamsearchAlgorithm\tavailable\n", + "BeamsearchHistoryAlgorithm\tavailable\n", + "GEPA-Base\tavailable\n", + "GEPA-Beam\tavailable\n", + "GEPA-UCB\tavailable\n", + "Minibatch\tavailable\n", + "MinibatchAlgorithm\tavailable\n", + "PrioritySearch\tavailable\n", + "PrioritySearch_with_Regressor\tavailable\n", + "SearchTemplate\tavailable\n", + "SequentialSearch\tavailable\n", + "SequentialUpdate\tavailable\n", + "StreamingPrioritySearch\tavailable\n", + "UCBSearchAlgorithm\tavailable\n", + "\n", + "=== Validate config (strict) ===\n", + "[OK] internal:code_param\n", + "[OK] internal:numeric_param\n", + "[OK] internal:multi_param\n", + "[OK] internal:non_trainable\n", + "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", + "[OK] trace_examples:greeting_stub\n", + "[OK] llm4ad:circle_packing\n", + "[SKIP] veribench:smoke_placeholder: veribench_unavailable: entrypoint not available (install Veribench or provide task list)\n", + "\n", + "[OK] matrix: 28 jobs expanded deterministically\n", + " job 741cb015f747: internal:code_param x PrioritySearch (seed=123)\n", + " job deec0f7230de: internal:code_param x GEPA-Base (seed=123)\n", + " job 09eba11e01cf: internal:code_param x GEPA-UCB (seed=123)\n", + " job 2baa9d102ae9: internal:code_param x GEPA-Beam (seed=123)\n", + " job 84b26f14a134: internal:numeric_param x PrioritySearch (seed=123)\n", + " job 2cdd86425cca: internal:numeric_param x GEPA-Base (seed=123)\n", + " job 0fdc0343cc34: internal:numeric_param x GEPA-UCB (seed=123)\n", + " job ce5b3461d160: internal:numeric_param x GEPA-Beam (seed=123)\n", + " job 9531e7285512: internal:multi_param x PrioritySearch (seed=123)\n", + " job e8011aad9336: internal:multi_param x GEPA-Base (seed=123)\n", + " job ecd3fbbd3c42: internal:multi_param x GEPA-UCB (seed=123)\n", + " job 2dd76882fd19: internal:multi_param x GEPA-Beam (seed=123)\n", + " job d52d40ca6b77: internal:non_trainable x PrioritySearch (seed=123)\n", + " job eb30b13f2e14: internal:non_trainable x GEPA-Base (seed=123)\n", + " job c865b1ec0cbc: internal:non_trainable x GEPA-UCB (seed=123)\n", + " job d870163c477d: internal:non_trainable x GEPA-Beam (seed=123)\n", + " job 3a1216485e9b: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", + " job 8538a43564b6: trace_examples:greeting_stub x GEPA-Base (seed=123)\n", + " job 28906417633f: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", + " job 94315da580b9: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", + " job 1dda87fd7ae7: llm4ad:circle_packing x PrioritySearch (seed=123)\n", + " job 4e4ef0c85cf3: llm4ad:circle_packing x GEPA-Base (seed=123)\n", + " job 977a714b5483: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", + " job 6f9dc2e38ac8: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", + " job e7fa76b4eab5: 
veribench:smoke_placeholder x PrioritySearch (seed=123)\n", + " job 51c6a932b453: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", + " job e006c4e16c3b: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", + " job a0147226edd9: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", + "\n", + " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", + " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 'PrioritySearch']\n", + "[OK] manifest written: /content/drive/MyDrive/bench/2026-02-11/trace_bench/20260211-131538-8e24e6b0/meta/manifest.json\n", + "\n", + "=== Generate M1 run config (mode=real) ===\n", + "Config mode: real\n", + "\n", + "=== Run M1 validation ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. 
Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:1: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:1: 2.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:20: Hello\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:20: Hello\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.063446105401886\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.5240756148\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.063446105401886\n", + "[Step 1] Update/best_candidate_mean_score: 1.063446105401886\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.9518487703269418\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.9518487703269418\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.9518487703269418\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " np.random.seed(2025)\n", + " circles = []\n", + " attempts = 0\n", + " max_attempts = 10000 # Limit attempts to prevent infinite loop\n", + "\n", + " while len(circles) < n and attempts < max_attempts:\n", + " radius = np.random.uniform(0.01, 0.1) # Random radius\n", + " x = np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " \n", + " # Check for overlap\n", + " overlap = False\n", + " for (cx, cy, cr) in circles:\n", + " distance = math.sqrt((cx - x) ** 2 + (cy - y) ** 2)\n", + " if distance < (cr + radius):\n", + " overlap = True\n", + " break\n", + "\n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + "\n", + " attempts += 1\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.3509495181645703\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml <\n", + "
[Rendered `df.head()` of `results.csv` (HTML table output reduced to its data; first 5 rows shown):]

| | run_id | job_id | task_id | suite | trainer_id | seed | status | score_initial | score_final | score_best | time_seconds | resolved_trainer_kwargs | resolved_optimizer_kwargs | eval_kwargs | feedback | tb_logdir |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20260211-131540-093c5358 | 741cb015f747 | internal:code_param | internal | PrioritySearch | 123 | ok | 1.0 | 1.0 | 1.0 | 4.113878 | {"memory_update_frequency": 1, "num_batches": ... | {"memory_size": 5, "objective": "Match the tar... | {"timeout_seconds": 10} | Correct | jobs/741cb015f747/tb |
| 1 | 20260211-131540-093c5358 | deec0f7230de | internal:code_param | internal | GEPA-Base | 123 | ok | 1.0 | 1.0 | 1.0 | 2.601084 | {"merge_every": 2, "num_iters": 1, "num_thread... | {"memory_size": 5, "objective": "Match the tar... | {"timeout_seconds": 10} | Correct | jobs/deec0f7230de/tb |
| 2 | 20260211-131540-093c5358 | 84b26f14a134 | internal:numeric_param | internal | PrioritySearch | 123 | ok | -3.0 | -0.0 | -0.0 | 6.988559 | {"memory_update_frequency": 1, "num_batches": ... | {"memory_size": 5, "objective": "Match the num... | {"timeout_seconds": 10} | target=3.0 | jobs/84b26f14a134/tb |
| 3 | 20260211-131540-093c5358 | 2cdd86425cca | internal:numeric_param | internal | GEPA-Base | 123 | ok | -0.0 | -0.0 | -0.0 | 4.882765 | {"merge_every": 2, "num_iters": 1, "num_thread... | {"memory_size": 5, "objective": "Match the num... | {"timeout_seconds": 10} | target=3.0 | jobs/2cdd86425cca/tb |
| 4 | 20260211-131540-093c5358 | 9531e7285512 | internal:multi_param | internal | PrioritySearch | 123 | ok | -1.0 | -0.0 | -0.0 | 9.249504 | {"memory_update_frequency": 1, "num_batches": ... | {"memory_size": 5, "objective": "Make a+b matc... | {"timeout_seconds": 10} | target=3.0 | jobs/9531e7285512/tb |
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 14,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260211-131540-093c5358\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"8538a43564b6\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"trace_examples\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 316227.80338516145,\n \"min\": -1000000.0,\n \"max\": 1.063446105401886,\n \"num_unique_values\": 6,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5618951165634496,\n \"min\": -0.0,\n \"max\": 1.3509495181645703,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5618951165634496,\n \"min\": -0.0,\n \"max\": 1.3509495181645703,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.042801912329264,\n \"min\": 0.000113,\n \"max\": 30.771424,\n \"num_unique_values\": 14,\n \"samples\": [\n 0.604331\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"num_threads\\\": 2, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the target code exactly.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n 
\"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"jobs/8538a43564b6/tb\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gpkb4-1Em0UW" + }, + "source": [ + "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", + "\n", + "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." + ], + "id": "gpkb4-1Em0UW" + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dMn7PDVgm0UX", + "outputId": "c8da1604-eca7-44c4-9736-894cbf386f67", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== 2x2 Matrix Smoke (mode=real) ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.3000000000000003\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.4091384736\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_mean_score: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.181723052700943\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.181723052700943\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 1.181723052700943\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " \n", + " np.random.seed(2025)\n", + " radius = 0.05 # Set a fixed radius for simplicity\n", + " circles = []\n", + " \n", + " for _ in range(n):\n", + " while True:\n", + " x = np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " # Check for overlap\n", + " overlap = False\n", + " for circle in circles:\n", + " if ((x - circle[0]) ** 2 + (y - circle[1]) ** 2) < (2 * radius) ** 2:\n", + " overlap = True\n", + " break\n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " break\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.3000000000000003\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml <\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.0
1internal:numeric_paraminternalGEPA-Base123ok-0.0
2llm4ad:circle_packingllm4adPrioritySearch123ok1.3
3llm4ad:circle_packingllm4adGEPA-Base123ok1.3
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7505553499465139,\n \"min\": -0.0,\n \"max\": 1.3000000000000005,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.3000000000000005\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. 
Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + ], + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/01_smoke_runner.ipynb b/notebooks/01_smoke_runner.ipynb new file mode 100644 index 0000000..283fb83 --- /dev/null +++ b/notebooks/01_smoke_runner.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trace-Bench Smoke Runner (Stub + Real)\n", + "\n", + "This notebook validates Trace-Bench in two modes:\n", + "\n", + "- **StubLLM**: deterministic, no API keys\n", + "- **Real LLM**: requires a user-provided API key (via Colab Secrets)\n", + "\n", + "It also shows the standardized run artifacts produced by the CLI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Expected Outputs (Quick Verification)\n", + "\n", + "You should see the following signals if the notebook is working correctly:\n", + "\n", + "- **Stub smoke run** completes with a new `runs//` folder.\n", + "- `config.snapshot.yaml`, `env.json`, `results.csv`, `events.jsonl` exist in that folder.\n", + "- `results.csv` shows at least one row with `task=example:greeting_stub` and `status=trained`.\n", + "- **Real-LLM smoke** completes (if API key is set) and `results.csv` shows `status=trained`.\n", + "- `pytest -q` ends with `passed` (LLM4AD optimizer tests run only when `OPENAI_API_KEY` is set)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive = Path(\"/content/drive/MyDrive\")\n", + " root = drive if drive.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: list tasks (external bench discovery)\n", + "!python -m trace_bench list-tasks --root LLM4AD/benchmark_tasks | head -n 30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Stub smoke (internal example task for deterministic output)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect latest run artifacts\n", + "import glob, json, pathlib, pandas as pd\n", + "\n", + "latest = sorted(glob.glob(f\"{RUNS_DIR}/*\"))[-1]\n", + "p = pathlib.Path(latest)\n", + "print(p)\n", + "\n", + "print((p / \"config.snapshot.yaml\").read_text()[:400])\n", + "print(json.loads((p / \"env.json\").read_text()).keys())\n", + "\n", + "pd.read_csv(p / \"results.csv\").head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Optional: external LLM4AD smoke (may yield low score if template fails)\n", + "cat > configs/smoke_llm4ad.yaml <<'YAML'\n", + "runs_dir: runs\n", + "mode: stub\n", + "seed: 123\n", + "tasks:\n", + " - circle_packing\n", + "trainers:\n", + " - PrioritySearch\n", + "YAML\n", + "\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke_llm4ad.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real LLM (requires API key)\n", + "\n", + "Add `OPENAI_API_KEY` in **Colab Secrets** and run the cells below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load API key from Colab Secrets\n", + "from google.colab import userdata\n", + "import os\n", + "\n", + "key = userdata.get(\"OPENAI_API_KEY\")\n", + "if not key:\n", + " raise RuntimeError(\"Missing OPENAI_API_KEY secret in Colab\")\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = key\n", + "os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + "os.environ[\"TRACE_LITELLM_MODEL\"] = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Real-LLM smoke (internal example task)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke_real.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Pytest (LLM4AD optimizer test runs only if OPENAI_API_KEY is set)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m pytest -q" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..be74aa6 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +pythonpath = . +addopts = -p no:langsmith diff --git a/setup.py b/setup.py index 30f3fdb..c879a60 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -import os +ο»Ώimport os import setuptools here = os.path.abspath(os.path.dirname(__file__)) @@ -15,9 +15,10 @@ "black", "scikit-learn", "tensorboardX", - "tensorboard" + "tensorboard", + "pyyaml", ] - + setuptools.setup( name="trace-bench", version=__version__, @@ -27,7 +28,12 @@ license='MIT LICENSE', description="An AutoDiff-like tool for training AI systems end-to-end with general feedback", long_description=open('README.md', encoding="utf8").read(), - packages=setuptools.find_packages(include=["opto*"]), + packages=setuptools.find_packages(include=["trace_bench*", "opto*"]), install_requires=install_requires, python_requires=">=3.9", + entry_points={ + "console_scripts": [ + "trace-bench=trace_bench.cli:main", + ] + }, ) diff --git a/tests/m0/test_config.py b/tests/m0/test_config.py new file mode 100644 index 0000000..71fabf3 --- /dev/null +++ b/tests/m0/test_config.py @@ -0,0 +1,8 @@ +ο»Ώfrom trace_bench.config import load_config + + +def test_load_config_smoke(): + cfg = load_config("configs/smoke.yaml") + assert cfg.mode == "stub" + assert cfg.tasks[0].id == "internal:numeric_param" + assert cfg.runs_dir == "runs" diff --git a/tests/m0/test_runner_smoke.py b/tests/m0/test_runner_smoke.py new file mode 100644 index 0000000..d43891b --- /dev/null +++ b/tests/m0/test_runner_smoke.py @@ -0,0 +1,38 @@ +import csv +import os +from pathlib import Path + +import pytest + +from trace_bench.config import load_config +from trace_bench.runner import BenchRunner + + +def test_runner_smoke(tmp_path): + try: + import graphviz # noqa: F401 + except Exception as exc: # pragma: no cover - dependency check + pytest.fail(f"graphviz is required for smoke: {exc}") + repo_root = Path(__file__).resolve().parents[2] + os.chdir(repo_root) + + cfg = load_config("configs/smoke.yaml") + cfg.runs_dir = str(tmp_path / 
"runs") + + runner = BenchRunner(cfg) + summary = runner.run() + + assert summary.results + run_dir = Path(cfg.runs_dir) / summary.run_id + assert run_dir.exists() + assert (run_dir / "meta" / "config.snapshot.yaml").exists() + assert (run_dir / "meta" / "env.json").exists() + assert (run_dir / "meta" / "manifest.json").exists() + assert (run_dir / "results.csv").exists() + assert (run_dir / "summary.json").exists() + + with (run_dir / "results.csv").open("r", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + assert rows + assert "job_id" in rows[0] + assert any(row.get("status") != "skipped" for row in rows) diff --git a/tests/m0/test_stub_llm.py b/tests/m0/test_stub_llm.py new file mode 100644 index 0000000..5d6cc21 --- /dev/null +++ b/tests/m0/test_stub_llm.py @@ -0,0 +1,25 @@ +ο»Ώimport pytest + +from trace_bench.registry import load_task_bundle + + +def _skip_if_missing_deps(exc: Exception): + msg = str(exc).lower() + if "graphviz" in msg or "opto" in msg: + pytest.skip(f"Optional dependency missing: {exc}") + + +def test_example_tasks_load(): + try: + bundle = load_task_bundle("trace_examples:greeting_stub", "LLM4AD/benchmark_tasks") + except Exception as exc: + _skip_if_missing_deps(exc) + raise + assert {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"}.issubset(bundle.keys()) + + try: + bundle2 = load_task_bundle("trace_examples:train_single_node_stub", "LLM4AD/benchmark_tasks") + except Exception as exc: + _skip_if_missing_deps(exc) + raise + assert {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"}.issubset(bundle2.keys()) diff --git a/tests/m1/test_artifact_serialization.py b/tests/m1/test_artifact_serialization.py new file mode 100644 index 0000000..e52daa5 --- /dev/null +++ b/tests/m1/test_artifact_serialization.py @@ -0,0 +1,57 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def _run_stub(tmp_path: Path) -> Path: + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [{"id": "internal:numeric_param"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"threads": 2}]}], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + summary = BenchRunner(cfg).run() + return Path(cfg.runs_dir) / summary.run_id + + +def test_no_memory_addresses_in_artifacts(tmp_path): + run_dir = _run_stub(tmp_path) + for path in run_dir.rglob("*"): + if not path.is_file(): + continue + if path.suffix not in {".json", ".jsonl", ".csv"}: + continue + text = path.read_text(encoding="utf-8") + assert "object at 0x" not in text + + +def test_structured_nested_fields_in_outputs(tmp_path): + run_dir = _run_stub(tmp_path) + job_dir = next((run_dir / "jobs").iterdir()) + + meta = json.loads((job_dir / "job_meta.json").read_text(encoding="utf-8")) + assert isinstance(meta["resolved_optimizer_kwargs"], dict) + assert isinstance(meta["resolved_trainer_kwargs"], dict) + + results = json.loads((job_dir / "results.json").read_text(encoding="utf-8")) + assert isinstance(results["resolved_optimizer_kwargs"], dict) + assert isinstance(results["resolved_trainer_kwargs"], dict) + + event_lines = (job_dir / "events.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert event_lines + event = json.loads(event_lines[0]) + assert isinstance(event["resolved_optimizer_kwargs"], dict) + assert isinstance(event["resolved_trainer_kwargs"], dict) + + with (run_dir / "results.csv").open("r", encoding="utf-8") as handle: + rows = 
list(csv.DictReader(handle)) + assert rows + parsed = json.loads(rows[0]["resolved_optimizer_kwargs"]) + assert isinstance(parsed, dict) + diff --git a/tests/m1/test_artifacts_layout.py b/tests/m1/test_artifacts_layout.py new file mode 100644 index 0000000..618607e --- /dev/null +++ b/tests/m1/test_artifacts_layout.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from trace_bench.config import load_config +from trace_bench.runner import BenchRunner + + +def test_artifacts_layout(tmp_path): + cfg = load_config("configs/smoke.yaml") + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + assert (run_dir / "meta" / "config.snapshot.yaml").exists() + assert (run_dir / "meta" / "env.json").exists() + assert (run_dir / "meta" / "git.json").exists() + assert (run_dir / "meta" / "manifest.json").exists() + assert (run_dir / "results.csv").exists() + assert (run_dir / "summary.json").exists() + + jobs_dir = run_dir / "jobs" + job_dirs = [p for p in jobs_dir.iterdir() if p.is_dir()] + assert job_dirs, "expected at least one job directory" + job_dir = job_dirs[0] + assert (job_dir / "job_meta.json").exists() + assert (job_dir / "results.json").exists() + assert (job_dir / "events.jsonl").exists() + assert (job_dir / "tb").exists() diff --git a/tests/m1/test_internal_tasks.py b/tests/m1/test_internal_tasks.py new file mode 100644 index 0000000..ac5b674 --- /dev/null +++ b/tests/m1/test_internal_tasks.py @@ -0,0 +1,23 @@ +from trace_bench.config import RunConfig +from trace_bench.registry import load_task_bundle +from trace_bench.runner import BenchRunner + + +def test_internal_tasks_load(): + bundle = load_task_bundle("internal:code_param", "LLM4AD/benchmark_tasks") + assert "param" in bundle + bundle2 = load_task_bundle("internal:numeric_param", "LLM4AD/benchmark_tasks") + assert "param" in bundle2 + + +def test_internal_non_trainable_fails(tmp_path): + cfg = RunConfig.from_dict( + { + "tasks": [{"id": "internal:non_trainable"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"ps_steps": 1}]}], + "seeds": [123], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + summary = BenchRunner(cfg).run() + assert any(row.get("status") == "failed" for row in summary.results) diff --git a/tests/m1/test_manifest_truth.py b/tests/m1/test_manifest_truth.py new file mode 100644 index 0000000..33109f8 --- /dev/null +++ b/tests/m1/test_manifest_truth.py @@ -0,0 +1,42 @@ +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def test_manifest_matches_job_meta(tmp_path): + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [ + {"id": "internal:numeric_param"}, + {"id": "trace_examples:greeting_stub"}, + ], + "trainers": [ + {"id": "PrioritySearch", "params_variants": [{"threads": 2}]}, + {"id": "GEPA-Base", "params_variants": [{"gepa_iters": 1}]}, + ], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + manifest = json.loads((run_dir / "meta" / "manifest.json").read_text(encoding="utf-8")) + + assert manifest["jobs"], "expected manifest jobs" + for entry in manifest["jobs"]: + if entry.get("status") == "not_executed": + continue + job_meta_path = run_dir / "jobs" / entry["job_id"] / "job_meta.json" + assert job_meta_path.exists() + job_meta = json.loads(job_meta_path.read_text(encoding="utf-8")) + assert entry["raw_params"] == job_meta["raw_params"] + 
assert entry["resolved_trainer_kwargs"] == job_meta["resolved_trainer_kwargs"] + assert entry["resolved_optimizer_kwargs"] == job_meta["resolved_optimizer_kwargs"] + assert entry["resolved_guide_kwargs"] == job_meta["resolved_guide_kwargs"] + assert entry["resolved_logger_kwargs"] == job_meta["resolved_logger_kwargs"] + assert entry["eval_kwargs"] == job_meta["eval_kwargs"] + diff --git a/tests/m1/test_matrix.py b/tests/m1/test_matrix.py new file mode 100644 index 0000000..766b194 --- /dev/null +++ b/tests/m1/test_matrix.py @@ -0,0 +1,51 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig, load_config +from trace_bench.matrix import compute_job_id, expand_matrix +from trace_bench.runner import BenchRunner + + +def test_expand_matrix_counts(): + cfg = RunConfig.from_dict( + { + "tasks": [{"id": "internal:numeric_param"}, {"id": "internal:code_param"}], + "trainers": [ + {"id": "PrioritySearch", "params_variants": [{}]}, + {"id": "GEPA-Base", "params_variants": [{}]}, + ], + "seeds": [123], + } + ) + jobs = expand_matrix(cfg) + assert len(jobs) == 4 + + +def test_job_id_stable(): + job_id_1 = compute_job_id("internal:numeric_param", "PrioritySearch", {"ps_steps": 1}, 123) + job_id_2 = compute_job_id("internal:numeric_param", "PrioritySearch", {"ps_steps": 1}, 123) + assert job_id_1 == job_id_2 + + +def test_matrix_smoke_e2e(tmp_path): + """Run 2 tasks x 2 trainers x 1 seed = 4 jobs end-to-end and verify results.""" + cfg = load_config("configs/m1_matrix_smoke.yaml") + cfg.runs_dir = str(tmp_path / "runs") + cfg.mode = "stub" + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + # results.csv must have exactly 4 data rows + results_csv = run_dir / "results.csv" + assert results_csv.exists() + with open(results_csv) as f: + rows = list(csv.DictReader(f)) + assert len(rows) == 4, f"Expected 4 rows in results.csv, got {len(rows)}" + + # summary.json must aggregate 4 jobs + summary_json = run_dir / "summary.json" + assert summary_json.exists() + summary_data = json.loads(summary_json.read_text()) + assert summary_data["total_jobs"] == 4 diff --git a/tests/m1/test_opentrace_examples_smoke.py b/tests/m1/test_opentrace_examples_smoke.py new file mode 100644 index 0000000..f22b275 --- /dev/null +++ b/tests/m1/test_opentrace_examples_smoke.py @@ -0,0 +1,88 @@ +import os +import re +import subprocess +import sys +from pathlib import Path + +import pytest + + +EXAMPLE_ALLOWLIST = { + "autogen", + "datasets", + "dotenv", + "dspy", + "graphviz", + "textgrad", +} + + +def _open_trace_root() -> Path: + repo_root = Path(__file__).resolve().parents[2] + return repo_root.parent / "OpenTrace" + + +def _example_files() -> list[Path]: + root = _open_trace_root() / "examples" + if not root.exists(): + pytest.skip("OpenTrace examples directory not found") + return sorted([p for p in root.rglob("*.py") if p.is_file()]) + + +def _is_argparse_script(path: Path) -> bool: + try: + text = path.read_text(encoding="utf-8") + except Exception: + return False + return "argparse" in text or "ArgumentParser(" in text + + +def _extract_missing_module(output: str) -> str | None: + match = re.search(r"No module named ['\"]([^'\"]+)['\"]", output) + if match: + return match.group(1) + return None + + +def _run_smoke(path: Path): + env = dict(os.environ) + env["PYTHONPATH"] = str(_open_trace_root()) + + env["TRACE_BENCH_SMOKE"] = "1" + + if _is_argparse_script(path): + cmd = [sys.executable, str(path), "--help"] + else: + cmd = [ + sys.executable, + 
"-c", + f"import runpy; runpy.run_path(r'{path.as_posix()}', run_name='__not_main__')", + ] + + try: + proc = subprocess.run( + cmd, + env=env, + capture_output=True, + text=True, + cwd=str(path.parent), + timeout=30, + ) + return proc + except subprocess.TimeoutExpired: + raise AssertionError(f"Smoke timed out for {path}") + + +@pytest.mark.parametrize("path", _example_files()) +def test_opentrace_examples_smoke(path: Path): + strict = os.environ.get("TRACE_BENCH_STRICT_EXAMPLES") == "1" + proc = _run_smoke(path) + if proc.returncode == 0: + return + + output = (proc.stdout or "") + "\n" + (proc.stderr or "") + missing = _extract_missing_module(output) + if missing and missing in EXAMPLE_ALLOWLIST and not strict: + pytest.skip(f"Optional dependency missing for {path.name}: {missing}") + + raise AssertionError(f"Smoke failed for {path}:\n{output}") diff --git a/tests/m1/test_threads_mapping.py b/tests/m1/test_threads_mapping.py new file mode 100644 index 0000000..7746ced --- /dev/null +++ b/tests/m1/test_threads_mapping.py @@ -0,0 +1,32 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def test_threads_maps_to_num_threads(tmp_path): + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [{"id": "internal:numeric_param"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"threads": 3}]}], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + job_dirs = [p for p in (run_dir / "jobs").iterdir() if p.is_dir()] + assert job_dirs, "expected at least one job directory" + meta = json.loads((job_dirs[0] / "job_meta.json").read_text(encoding="utf-8")) + assert meta["resolved_trainer_kwargs"]["num_threads"] == 3 + + with (run_dir / "results.csv").open("r", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + assert rows, "expected at least one results row" + resolved = json.loads(rows[0]["resolved_trainer_kwargs"]) + assert resolved["num_threads"] == 3 diff --git a/tests/m1/test_trainer_config.py b/tests/m1/test_trainer_config.py new file mode 100644 index 0000000..f766c74 --- /dev/null +++ b/tests/m1/test_trainer_config.py @@ -0,0 +1,22 @@ +import pytest + +from trace_bench.config import RunConfig + + +def test_trainer_params_variants_parsed(): + cfg = RunConfig.from_dict( + { + "trainers": [ + { + "id": "PrioritySearch", + "params_variants": [{"ps_steps": 2}], + } + ] + } + ) + assert cfg.trainers[0].params_variants[0]["ps_steps"] == 2 + + +def test_trainer_missing_id_raises(): + with pytest.raises(ValueError): + RunConfig.from_dict({"trainers": [{"params_variants": [{}]}]}) diff --git a/tests/m1/test_validate_runs_dir.py b/tests/m1/test_validate_runs_dir.py new file mode 100644 index 0000000..d881255 --- /dev/null +++ b/tests/m1/test_validate_runs_dir.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from trace_bench.cli import cmd_validate + + +def test_validate_writes_manifest_to_runs_dir(tmp_path): + config_path = tmp_path / "validate.yaml" + config_path.write_text( + "\n".join( + [ + "mode: stub", + "tasks:", + " - id: internal:numeric_param", + "trainers:", + " - id: PrioritySearch", + " params_variants:", + " - threads: 2", + ] + ), + encoding="utf-8", + ) + + runs_dir = tmp_path / "colab_runs" + rc = cmd_validate( + str(config_path), + "LLM4AD/benchmark_tasks", + bench=None, + strict=True, + runs_dir=str(runs_dir), + ) + assert rc == 0 + + run_dirs = [path for path in 
runs_dir.iterdir() if path.is_dir()] + assert run_dirs, "validate should create one run directory under --runs-dir" + manifest_path = run_dirs[0] / "meta" / "manifest.json" + assert manifest_path.exists() + diff --git a/tests/m1/test_veribench_cli.py b/tests/m1/test_veribench_cli.py new file mode 100644 index 0000000..086326b --- /dev/null +++ b/tests/m1/test_veribench_cli.py @@ -0,0 +1,15 @@ +from trace_bench.cli import cmd_list_tasks, cmd_validate + + +def test_veribench_list_tasks_does_not_fail(): + assert cmd_list_tasks("LLM4AD/benchmark_tasks", bench="veribench") == 0 + + +def test_veribench_validate_does_not_fail(tmp_path, capsys): + config_path = tmp_path / "veribench.yaml" + config_path.write_text( + "tasks:\n - id: veribench:smoke_placeholder\n", encoding="utf-8" + ) + assert cmd_validate(str(config_path), "LLM4AD/benchmark_tasks", bench="veribench") == 0 + out = capsys.readouterr().out + assert "[SKIP]" in out diff --git a/tests/test_lite_optimize_llm4ad.py b/tests/test_lite_optimize_llm4ad.py index 39df40c..03994ca 100644 --- a/tests/test_lite_optimize_llm4ad.py +++ b/tests/test_lite_optimize_llm4ad.py @@ -90,6 +90,9 @@ def _get_param_value(param): @pytest.mark.parametrize("task", TASKS) def test_lite_optimize_llm4ad_task(task): + if not os.environ.get("OPENAI_API_KEY"): + pytest.skip("OPENAI_API_KEY not set; skipping LLM-backed optimizer test.") + try: llm4ad_loader = _import_llm4ad_loader() except Exception as exc: diff --git a/trace_bench/__init__.py b/trace_bench/__init__.py new file mode 100644 index 0000000..5899023 --- /dev/null +++ b/trace_bench/__init__.py @@ -0,0 +1,6 @@ +ο»Ώ"""Trace-Bench runner package.""" + +from .config import RunConfig, load_config +from .runner import BenchRunner + +__all__ = ["RunConfig", "load_config", "BenchRunner"] diff --git a/trace_bench/__main__.py b/trace_bench/__main__.py new file mode 100644 index 0000000..6dbaea4 --- /dev/null +++ b/trace_bench/__main__.py @@ -0,0 +1,4 @@ +ο»Ώfrom trace_bench.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/trace_bench/artifacts.py b/trace_bench/artifacts.py new file mode 100644 index 0000000..47566a3 --- /dev/null +++ b/trace_bench/artifacts.py @@ -0,0 +1,260 @@ +ο»Ώfrom __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional +import csv +import json +import os +import re +import subprocess +from datetime import datetime +import platform +import sys + + +@dataclass +class RunArtifacts: + run_dir: Path + meta_dir: Path + jobs_dir: Path + + @property + def config_snapshot(self) -> Path: + return self.meta_dir / "config.snapshot.yaml" + + @property + def env_json(self) -> Path: + return self.meta_dir / "env.json" + + @property + def git_json(self) -> Path: + return self.meta_dir / "git.json" + + @property + def manifest_json(self) -> Path: + return self.meta_dir / "manifest.json" + + @property + def results_csv(self) -> Path: + return self.run_dir / "results.csv" + + @property + def summary_json(self) -> Path: + return self.run_dir / "summary.json" + + +@dataclass +class JobArtifacts: + job_dir: Path + + @property + def job_meta(self) -> Path: + return self.job_dir / "job_meta.json" + + @property + def results_json(self) -> Path: + return self.job_dir / "results.json" + + @property + def events_jsonl(self) -> Path: + return self.job_dir / "events.jsonl" + + @property + def artifacts_dir(self) -> Path: + return self.job_dir / "artifacts" + + @property + def tb_dir(self) -> Path: + return 
self.job_dir / "tb" + + +def init_run_dir(runs_dir: str, run_id: str) -> RunArtifacts: + run_path = Path(runs_dir) / run_id + meta_dir = run_path / "meta" + jobs_dir = run_path / "jobs" + meta_dir.mkdir(parents=True, exist_ok=True) + jobs_dir.mkdir(parents=True, exist_ok=True) + return RunArtifacts(run_dir=run_path, meta_dir=meta_dir, jobs_dir=jobs_dir) + + +def init_job_dir(run_artifacts: RunArtifacts, job_id: str) -> JobArtifacts: + job_dir = run_artifacts.jobs_dir / job_id + job_dir.mkdir(parents=True, exist_ok=True) + (job_dir / "artifacts").mkdir(parents=True, exist_ok=True) + (job_dir / "tb").mkdir(parents=True, exist_ok=True) + return JobArtifacts(job_dir=job_dir) + + +def _dump_yaml_or_json(data: Dict[str, Any]) -> str: + try: + import yaml # type: ignore + return yaml.safe_dump(data, sort_keys=False) + except Exception: + return json.dumps(data, indent=2, sort_keys=False) + + +def write_config_snapshot(path: Path, data: Dict[str, Any]) -> None: + path.write_text(_dump_yaml_or_json(data), encoding="utf-8") + + +def _git_info() -> Dict[str, Any]: + info: Dict[str, Any] = {} + try: + root = Path(__file__).resolve().parents[1] + info["repo_root"] = str(root) + info["commit"] = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=root).decode().strip() + info["branch"] = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=root).decode().strip() + return info + except Exception: + return info + + +_ENV_ALLOWLIST = { + "TRACE_DEFAULT_LLM_BACKEND", + "TRACE_LITELLM_MODEL", + "TRACE_CUSTOMLLM_MODEL", + "TRACE_CUSTOMLLM_URL", + "CUDA_VISIBLE_DEVICES", + "PYTHONPATH", +} + +_ENV_PREFIX_ALLOWLIST = ( + "TRACE_", + "OPENAI_", + "ANTHROPIC_", + "AZURE_", + "HF_", + "HUGGINGFACE_", +) + +_SENSITIVE_TOKENS = ("KEY", "TOKEN", "SECRET", "PASSWORD") + + +def _is_allowed_env_key(key: str) -> bool: + if key in _ENV_ALLOWLIST: + return True + return any(key.startswith(prefix) for prefix in _ENV_PREFIX_ALLOWLIST) + + +def _redact_env_value(key: str, value: str) -> str: + if any(token in key.upper() for token in _SENSITIVE_TOKENS): + return "***REDACTED***" + return value + + +def write_env_json(path: Path) -> None: + env: Dict[str, str] = {} + for key in sorted(os.environ.keys()): + if _is_allowed_env_key(key): + env[key] = _redact_env_value(key, os.environ.get(key, "")) + payload = { + "captured_at": datetime.utcnow().isoformat() + "Z", + "env": env, + "runtime": { + "python_version": sys.version.split()[0], + "platform": platform.platform(), + }, + } + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + + +def write_git_json(path: Path) -> None: + path.write_text(json.dumps(_git_info(), indent=2), encoding="utf-8") + + +_OBJECT_REPR_PATTERN = re.compile(r"<([^>]+) object at 0x[0-9A-Fa-f]+>") +_SENSITIVE_FIELD_TOKENS = ("KEY", "TOKEN", "SECRET", "PASSWORD") + + +def _sanitize_string(value: str) -> str: + return _OBJECT_REPR_PATTERN.sub(r"<\1>", value) + + +def sanitize_for_json(value: Any) -> Any: + if value is None or isinstance(value, (bool, int, float)): + return value + if isinstance(value, str): + return _sanitize_string(value) + if isinstance(value, Path): + return str(value) + if isinstance(value, dict): + sanitized: Dict[str, Any] = {} + for key, item in value.items(): + key_str = str(key) + if any(token in key_str.upper() for token in _SENSITIVE_FIELD_TOKENS): + sanitized[key_str] = "***REDACTED***" + else: + sanitized[key_str] = sanitize_for_json(item) + return sanitized + if isinstance(value, (list, tuple, set)): + return [sanitize_for_json(item) for 
item in value] + + metadata: Dict[str, Any] = { + "__class__": value.__class__.__name__, + "__module__": value.__class__.__module__, + } + for attr in ("model_name", "model", "provider", "backend", "name"): + try: + attr_value = getattr(value, attr) + except Exception: + continue + if attr_value is None: + continue + if isinstance(attr_value, (str, int, float, bool)): + metadata[attr] = sanitize_for_json(attr_value) + elif isinstance(attr_value, Path): + metadata[attr] = str(attr_value) + return metadata + + +def _dump_json(payload: Dict[str, Any]) -> str: + return json.dumps(sanitize_for_json(payload), indent=2, ensure_ascii=False) + + +def write_manifest(path: Path, manifest: Dict[str, Any]) -> None: + path.write_text(_dump_json(manifest), encoding="utf-8") + + +def write_job_meta(path: Path, job_meta: Dict[str, Any]) -> None: + path.write_text(_dump_json(job_meta), encoding="utf-8") + + +def write_job_results(path: Path, results: Dict[str, Any]) -> None: + path.write_text(_dump_json(results), encoding="utf-8") + + +def append_results_csv(path: Path, fieldnames: List[str], row: Dict[str, Any]) -> None: + write_header = not path.exists() + with path.open("a", encoding="utf-8", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + if write_header: + writer.writeheader() + writer.writerow(row) + + +def append_event(path: Path, event: Dict[str, Any]) -> None: + with path.open("a", encoding="utf-8") as f: + f.write(json.dumps(sanitize_for_json(event), ensure_ascii=False) + "\n") + + +def write_summary(path: Path, summary: Dict[str, Any]) -> None: + path.write_text(_dump_json(summary), encoding="utf-8") + + +__all__ = [ + "RunArtifacts", + "JobArtifacts", + "init_run_dir", + "init_job_dir", + "write_config_snapshot", + "write_env_json", + "write_git_json", + "write_manifest", + "write_job_meta", + "write_job_results", + "append_results_csv", + "append_event", + "write_summary", + "sanitize_for_json", +] diff --git a/trace_bench/cli.py b/trace_bench/cli.py new file mode 100644 index 0000000..6340136 --- /dev/null +++ b/trace_bench/cli.py @@ -0,0 +1,319 @@ +ο»Ώfrom __future__ import annotations + +import argparse +import json +from datetime import datetime +from pathlib import Path +import sys + +from trace_bench.config import load_config +from trace_bench.matrix import compute_run_id, expand_matrix +from trace_bench.registry import discover_tasks, discover_trainers, load_task_bundle +from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs +from trace_bench.runner import BenchRunner, _has_trainables +from trace_bench.artifacts import init_run_dir, write_manifest +from trace_bench.ui import launch_ui + + +def cmd_list_tasks(root: str, bench: str | None = None) -> int: + specs = discover_tasks(root, bench=bench) + for spec in specs: + print(spec.id) + return 0 + + +def cmd_list_trainers(include_all: bool = False) -> int: + specs = discover_trainers() + for spec in specs: + if not include_all and not spec.available: + continue + status = "available" if spec.available else "unavailable" + print(f"{spec.id}\t{status}") + return 0 + + +def _task_in_bench(task_key: str, bench: str | None) -> bool: + if not bench: + return True + if ":" not in task_key: + task_key = f"llm4ad:{task_key}" + if "veribench" in bench and task_key.startswith("veribench:"): + return True + if "trace_examples" in bench and task_key.startswith("trace_examples:"): + return True + if "internal" in bench and task_key.startswith("internal:"): + return True + if "llm4ad" in bench and 
task_key.startswith("llm4ad:"): + return True + return False + + +_ALLOWED_TRAINER_KWARGS = { + "threads", + "num_threads", + "num_epochs", + "num_steps", + "num_batches", + "num_candidates", + "num_proposals", + "num_iters", + "num_search_iterations", + "train_batch_size", + "merge_every", + "pareto_subset_size", + "ps_steps", + "ps_batches", + "ps_candidates", + "ps_proposals", + "ps_mem_update", + "gepa_iters", + "gepa_train_bs", + "gepa_merge_every", + "gepa_pareto_subset", + # LLM4AD pass-through knobs (merged into params_variants by config parser) + "optimizer_kwargs", + "eval_kwargs", +} + + +def _resolve_symbol(module_name: str, symbol: str) -> bool: + try: + module = __import__(module_name, fromlist=[symbol]) + return hasattr(module, symbol) + except Exception: + return False + + +def _validate_trainer_params(trainer, errors: list[str]) -> None: + for params in trainer.params_variants or [{}]: + for key in params.keys(): + if key not in _ALLOWED_TRAINER_KWARGS: + errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}") + + if trainer.optimizer and not _resolve_symbol("opto.optimizers", trainer.optimizer): + errors.append(f"optimizer not found: {trainer.optimizer}") + if trainer.guide and not _resolve_symbol("opto.trainer.guide", trainer.guide): + errors.append(f"guide not found: {trainer.guide}") + if trainer.logger and not _resolve_symbol("opto.trainer.loggers", trainer.logger): + errors.append(f"logger not found: {trainer.logger}") + + +def cmd_validate( + config_path: str, + root: str, + bench: str | None = None, + strict: bool = False, + runs_dir: str | None = None, +) -> int: + cfg = load_config(config_path) + if runs_dir: + cfg.runs_dir = runs_dir + tasks_root = Path(root) + errors = 0 + if bench: + discover_tasks(tasks_root, bench=bench) + trainers = discover_trainers() + trainer_ids = {t.id for t in trainers if t.available} + strict_errors: list[str] = [] + for trainer in cfg.trainers: + if trainer.id not in trainer_ids: + errors += 1 + print(f"[FAIL] trainer {trainer.id}: not available") + if strict: + _validate_trainer_params(trainer, strict_errors) + if strict_errors: + for msg in strict_errors: + print(f"[FAIL] {msg}") + errors += len(strict_errors) + + bundle_cache: dict[str, dict | None] = {} + + def _bundle_cache_key(task) -> str: + eval_sig = json.dumps(task.eval_kwargs or {}, sort_keys=True) + return f"{task.id}|{eval_sig}" + + def _cache_bundle(task, bundle): + bundle_cache[_bundle_cache_key(task)] = bundle + + def _get_cached_bundle(task): + key = _bundle_cache_key(task) + if key in bundle_cache: + return bundle_cache[key] + try: + bundle = load_task_bundle(task.id, tasks_root, eval_kwargs=task.eval_kwargs) + _cache_bundle(task, bundle) + except Exception: + bundle_cache[key] = None + return bundle_cache.get(key) + + for task in cfg.tasks: + task_id = task.id + if not _task_in_bench(task_id, bench): + continue + try: + bundle = load_task_bundle(task_id, tasks_root, eval_kwargs=task.eval_kwargs) + _cache_bundle(task, bundle) + print(f"[OK] {task_id}") + if strict: + if not _has_trainables(bundle["param"]): + if task_id == "internal:non_trainable": + print(f"[EXPECTED] {task_id}: no_trainable_parameters") + else: + errors += 1 + print(f"[FAIL] {task_id}: no_trainable_parameters") + except NotImplementedError as exc: + print(f"[SKIP] {task_id}: {exc}") + except Exception as exc: + errors += 1 + print(f"[FAIL] {task_id}: {exc}") + + if strict: + jobs = expand_matrix(cfg) + if not jobs: + errors += 1 + print("[FAIL] matrix: no jobs expanded") + else: + 
print(f"\n[OK] matrix: {len(jobs)} jobs expanded deterministically") + seen_trainers: set[str] = set() + seen_tasks: set[str] = set() + for job in jobs: + seen_trainers.add(job.trainer_id) + seen_tasks.add(job.task_id) + print(f" job {job.job_id}: {job.task_id} x {job.trainer_id} (seed={job.seed})") + print(f"\n tasks: {sorted(seen_tasks)}") + print(f" trainers: {sorted(seen_trainers)}") + run_id = compute_run_id(cfg.snapshot()) + artifacts = init_run_dir(cfg.runs_dir, run_id) + manifest_jobs = [] + for job in jobs: + bundle = _get_cached_bundle(job.task) + status_hint = "ok" + skip_reason = "" + if bundle is None: + try: + bundle = load_task_bundle(job.task_id, tasks_root, eval_kwargs=job.task.eval_kwargs) + _cache_bundle(job.task, bundle) + except NotImplementedError as exc: + status_hint = "skipped" + skip_reason = str(exc) + except Exception as exc: + status_hint = "failed" + skip_reason = f"task_load_error: {exc}" + + manifest_jobs.append( + { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "raw_params": dict(job.params), + "resolved_trainer_kwargs": resolve_trainer_kwargs(job.params, job.trainer_id), + "resolved_optimizer_kwargs": merge_kwargs( + (bundle or {}).get("optimizer_kwargs", {}), + job.trainer.optimizer_kwargs or {}, + ), + "resolved_guide_kwargs": merge_kwargs( + (bundle or {}).get("guide_kwargs"), + job.trainer.guide_kwargs or {}, + ), + "resolved_logger_kwargs": merge_kwargs( + (bundle or {}).get("logger_kwargs"), + job.trainer.logger_kwargs or {}, + ), + "eval_kwargs": dict(job.task.eval_kwargs or {}), + "status_hint": status_hint, + "skip_reason": skip_reason, + } + ) + manifest = { + "run_id": run_id, + "generated_at": datetime.utcnow().isoformat() + "Z", + "jobs": manifest_jobs, + } + write_manifest(artifacts.manifest_json, manifest) + print(f"[OK] manifest written: {artifacts.manifest_json}") + return 1 if errors else 0 + + +def cmd_run( + config_path: str, + root: str, + runs_dir: str | None = None, + max_workers: int | None = None, +) -> int: + cfg = load_config(config_path) + if runs_dir: + cfg.runs_dir = runs_dir + if max_workers is not None: + cfg.max_workers = max_workers + runner = BenchRunner(cfg, tasks_root=root) + runner.run() + return 0 + + +def cmd_ui(runs_dir: str) -> int: + return launch_ui(runs_dir) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="trace-bench") + sub = parser.add_subparsers(dest="cmd", required=True) + + list_p = sub.add_parser("list-tasks", help="List discoverable tasks") + list_p.add_argument("--root", default="LLM4AD/benchmark_tasks") + list_p.add_argument( + "--bench", + "--dataset-name", + dest="bench", + default=None, + help="Bench selection: llm4ad,trace_examples,internal,veribench", + ) + + list_t = sub.add_parser("list-trainers", help="List discoverable trainers") + list_t.add_argument("--all", action="store_true", help="Include unavailable trainers") + + val_p = sub.add_parser("validate", help="Validate tasks in config") + val_p.add_argument("--config", required=True) + val_p.add_argument("--root", default="LLM4AD/benchmark_tasks") + val_p.add_argument( + "--bench", + "--dataset-name", + dest="bench", + default=None, + help="Bench selection: llm4ad,trace_examples,internal,veribench", + ) + val_p.add_argument("--strict", action="store_true") + val_p.add_argument("--runs-dir", "--output-dir", dest="runs_dir", default=None) + + run_p = sub.add_parser("run", help="Run a benchmark config") + 
run_p.add_argument("--config", required=True) + run_p.add_argument("--root", default="LLM4AD/benchmark_tasks") + run_p.add_argument("--runs-dir", "--output-dir", dest="runs_dir", default=None) + run_p.add_argument("--max-workers", "--n-concurrent", dest="max_workers", type=int, default=None) + + ui_p = sub.add_parser("ui", help="Launch Gradio UI (stub)") + ui_p.add_argument("--runs-dir", default="runs") + + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + if args.cmd == "list-tasks": + return cmd_list_tasks(args.root, args.bench) + if args.cmd == "list-trainers": + return cmd_list_trainers(args.all) + if args.cmd == "validate": + return cmd_validate(args.config, args.root, args.bench, args.strict, args.runs_dir) + if args.cmd == "run": + return cmd_run(args.config, args.root, args.runs_dir, args.max_workers) + if args.cmd == "ui": + return cmd_ui(args.runs_dir) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/trace_bench/config.py b/trace_bench/config.py new file mode 100644 index 0000000..6d89237 --- /dev/null +++ b/trace_bench/config.py @@ -0,0 +1,233 @@ +ο»Ώfrom __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional +import json +import uuid + + +_LLM4AD_KNOBS = { + "threads", + "num_threads", + "optimizer_kwargs", + "eval_kwargs", + "ps_steps", + "ps_batches", + "ps_candidates", + "ps_proposals", + "ps_mem_update", + "gepa_iters", + "gepa_train_bs", + "gepa_merge_every", + "gepa_pareto_subset", +} + + +def _load_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def _load_yaml_or_json(path: Path) -> Dict[str, Any]: + text = _load_text(path) + # Prefer YAML if available + try: + import yaml # type: ignore + data = yaml.safe_load(text) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError("Config must be a mapping at top-level") + return data + except Exception: + # Fallback to JSON for environments without PyYAML + try: + data = json.loads(text) + if not isinstance(data, dict): + raise ValueError("Config must be a mapping at top-level") + return data + except json.JSONDecodeError as exc: + raise ValueError( + f"Failed to parse config {path}. Install PyYAML or use JSON syntax. 
Error: {exc}" + ) + + +def _as_dict(value: Optional[Dict[str, Any]]) -> Dict[str, Any]: + return dict(value or {}) + + +def _normalize_key(key: str) -> str: + return key.replace("-", "_") + + +def _extract_llm4ad_knobs(data: Dict[str, Any]) -> Dict[str, Any]: + knobs: Dict[str, Any] = {} + for raw_key, value in data.items(): + key = _normalize_key(raw_key) + if key in _LLM4AD_KNOBS: + knobs[key] = value + return knobs + + +@dataclass +class TaskConfig: + id: str + eval_kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class TrainerConfig: + id: str + params_variants: List[Dict[str, Any]] = field(default_factory=list) + optimizer: Optional[str] = None + optimizer_kwargs: Dict[str, Any] = field(default_factory=dict) + guide: Optional[str] = None + guide_kwargs: Dict[str, Any] = field(default_factory=dict) + logger: Optional[str] = None + logger_kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class RunConfig: + run_id: Optional[str] = None + runs_dir: str = "runs" + mode: str = "stub" + seeds: List[int] = field(default_factory=lambda: [123]) + max_workers: int = 1 + fail_fast: bool = False + tasks: List[TaskConfig] = field(default_factory=list) + trainers: List[TrainerConfig] = field(default_factory=list) + eval_kwargs: Dict[str, Any] = field(default_factory=dict) + trainer_kwargs: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "RunConfig": + runs_dir = data.get("runs_dir", data.get("runs_root", "runs")) + mode = data.get("mode", "stub") + seeds = data.get("seeds") + if seeds is None: + seed = int(data.get("seed", 123)) + seeds = [seed] + else: + seeds = [int(x) for x in (seeds or [])] or [123] + + if "max_workers" in data: + max_workers = data.get("max_workers") + else: + max_workers = data.get("n_concurrent", data.get("n-concurrent", 1)) + max_workers = int(max_workers) + fail_fast = bool(data.get("fail_fast", False)) + + default_eval = _as_dict(data.get("eval_kwargs")) + default_trainer_kwargs = _as_dict(data.get("trainer_kwargs")) + default_trainer_kwargs.update(_extract_llm4ad_knobs(data)) + + tasks: List[TaskConfig] = [] + for item in list(data.get("tasks", []) or []): + if isinstance(item, str): + tasks.append(TaskConfig(id=item, eval_kwargs=dict(default_eval))) + elif isinstance(item, dict): + task_id = item.get("id") or item.get("key") or item.get("task") + if not task_id: + raise ValueError(f"Task entry missing id: {item}") + eval_kwargs = dict(default_eval) + eval_kwargs.update(_as_dict(item.get("eval_kwargs"))) + tasks.append(TaskConfig(id=str(task_id), eval_kwargs=eval_kwargs)) + else: + raise ValueError(f"Unsupported task entry: {item}") + + trainers: List[TrainerConfig] = [] + for item in list(data.get("trainers", []) or []): + if isinstance(item, str): + params_variants = [dict(default_trainer_kwargs)] + trainers.append(TrainerConfig(id=item, params_variants=params_variants)) + continue + if not isinstance(item, dict): + raise ValueError(f"Unsupported trainer entry: {item}") + + trainer_id = item.get("id") or item.get("name") or item.get("trainer") or item.get("key") + if not trainer_id: + raise ValueError(f"Trainer entry missing id: {item}") + + params_variants = item.get("params_variants") + if params_variants is None: + params = item.get("params") or item.get("trainer_kwargs") or {} + params_variants = [params] + normalized_variants: List[Dict[str, Any]] = [] + for variant in list(params_variants or [{}]): + merged = dict(default_trainer_kwargs) + 
merged.update(_extract_llm4ad_knobs(item)) + merged.update(dict(variant or {})) + normalized_variants.append(merged) + + trainers.append( + TrainerConfig( + id=str(trainer_id), + params_variants=normalized_variants, + optimizer=item.get("optimizer"), + optimizer_kwargs=_as_dict(item.get("optimizer_kwargs")), + guide=item.get("guide"), + guide_kwargs=_as_dict(item.get("guide_kwargs")), + logger=item.get("logger"), + logger_kwargs=_as_dict(item.get("logger_kwargs")), + ) + ) + + if not trainers: + trainers = [TrainerConfig(id="PrioritySearch", params_variants=[dict(default_trainer_kwargs)])] + + return cls( + run_id=data.get("run_id"), + runs_dir=runs_dir, + mode=mode, + seeds=seeds, + max_workers=max_workers, + fail_fast=fail_fast, + tasks=tasks, + trainers=trainers, + eval_kwargs=default_eval, + trainer_kwargs=default_trainer_kwargs, + ) + + def ensure_run_id(self) -> str: + if not self.run_id: + self.run_id = str(uuid.uuid4()) + return self.run_id + + def snapshot(self) -> Dict[str, Any]: + return { + "run_id": self.run_id, + "runs_dir": self.runs_dir, + "mode": self.mode, + "seeds": list(self.seeds), + "max_workers": self.max_workers, + "fail_fast": self.fail_fast, + "tasks": [ + {"id": task.id, "eval_kwargs": dict(task.eval_kwargs)} + for task in self.tasks + ], + "trainers": [ + { + "id": trainer.id, + "params_variants": [dict(p) for p in trainer.params_variants], + "optimizer": trainer.optimizer, + "optimizer_kwargs": dict(trainer.optimizer_kwargs), + "guide": trainer.guide, + "guide_kwargs": dict(trainer.guide_kwargs), + "logger": trainer.logger, + "logger_kwargs": dict(trainer.logger_kwargs), + } + for trainer in self.trainers + ], + "eval_kwargs": dict(self.eval_kwargs), + "trainer_kwargs": dict(self.trainer_kwargs), + } + + +def load_config(path: str | Path) -> RunConfig: + config_path = Path(path) + data = _load_yaml_or_json(config_path) + return RunConfig.from_dict(data) + + +__all__ = ["RunConfig", "TaskConfig", "TrainerConfig", "load_config"] diff --git a/trace_bench/examples/__init__.py b/trace_bench/examples/__init__.py new file mode 100644 index 0000000..83e54f4 --- /dev/null +++ b/trace_bench/examples/__init__.py @@ -0,0 +1 @@ +ο»Ώ"""Example tasks for Trace-Bench.""" diff --git a/trace_bench/examples/greeting_stub.py b/trace_bench/examples/greeting_stub.py new file mode 100644 index 0000000..9c119f8 --- /dev/null +++ b/trace_bench/examples/greeting_stub.py @@ -0,0 +1,49 @@ +ο»Ώfrom __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class ExactMatchGuide(Guide): + def get_feedback(self, query: str, response: str, reference: str, **kwargs): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else f"Expected: {reference}" + return score, feedback + + +@trace.model +class GreetingAgent: + def __init__(self): + self.greeting = trace.node("Hello", trainable=True) + + def __call__(self, user_query: str): + name = user_query.split()[-1].strip("!.?") + return self.compose(self.greeting, name) + + @trace.bundle(trainable=True) + def compose(self, greeting, name: str): + greeting_value = getattr(greeting, "data", greeting) + return f"{greeting_value}, {name}!" 
+ + +def build_trace_problem(**override_eval_kwargs): + agent = GreetingAgent() + guide = ExactMatchGuide() + train_dataset = dict( + inputs=["Hello I am Sam"], + infos=["Hello, Sam!"], + ) + optimizer_kwargs = dict( + objective="Generate a correct greeting using the name from the query.", + memory_size=5, + ) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="example", entry="GreetingAgent"), + ) + + +__all__ = ["build_trace_problem", "GreetingAgent"] diff --git a/trace_bench/examples/internal_code_param.py b/trace_bench/examples/internal_code_param.py new file mode 100644 index 0000000..c9c78ce --- /dev/null +++ b/trace_bench/examples/internal_code_param.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class CodeExactGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else "Mismatch" + return score, feedback + + +@trace.model +class CodeParamAgent: + def __init__(self): + self.code = trace.node("def f(x): return x", trainable=True) + + def __call__(self, _input): + return self.emit(self.code) + + @trace.bundle(trainable=True) + def emit(self, code): + return code + + +def build_trace_problem(**_override_eval_kwargs): + agent = CodeParamAgent() + guide = CodeExactGuide() + train_dataset = dict(inputs=[None], infos=["def f(x): return x"]) + optimizer_kwargs = dict(objective="Match the target code exactly.", memory_size=5) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="CodeParamAgent"), + ) + + +__all__ = ["build_trace_problem", "CodeParamAgent"] diff --git a/trace_bench/examples/internal_multi_param.py b/trace_bench/examples/internal_multi_param.py new file mode 100644 index 0000000..d598954 --- /dev/null +++ b/trace_bench/examples/internal_multi_param.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class SumGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class MultiParamAgent: + def __init__(self): + self.a = trace.node(1.0, trainable=True) + self.b = trace.node(1.0, trainable=True) + + def __call__(self, _input): + return self.combine(self.a, self.b) + + @trace.bundle(trainable=True) + def combine(self, a, b): + return float(getattr(a, "data", a)) + float(getattr(b, "data", b)) + + +def build_trace_problem(**_override_eval_kwargs): + agent = MultiParamAgent() + guide = SumGuide() + train_dataset = dict(inputs=[None], infos=[3.0]) + optimizer_kwargs = dict(objective="Make a+b match the target value.", memory_size=5) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="MultiParamAgent"), + ) + + +__all__ = ["build_trace_problem", "MultiParamAgent"] diff --git a/trace_bench/examples/internal_non_trainable.py b/trace_bench/examples/internal_non_trainable.py new file mode 100644 index 0000000..08cec8b --- /dev/null +++ b/trace_bench/examples/internal_non_trainable.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from 
opto import trace +from opto.trainer.guide import Guide + + +class NoTrainGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else "Mismatch" + return score, feedback + + +@trace.model +class NonTrainableAgent: + def __init__(self): + self.value = trace.node("fixed", trainable=False) + + def __call__(self, _input): + return self.emit(self.value) + + @trace.bundle(trainable=False) + def emit(self, value): + return value + + +def build_trace_problem(**_override_eval_kwargs): + agent = NonTrainableAgent() + guide = NoTrainGuide() + train_dataset = dict(inputs=[None], infos=["fixed"]) + optimizer_kwargs = dict(objective="This should fail due to no trainables.", memory_size=1) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="NonTrainableAgent"), + ) + + +__all__ = ["build_trace_problem", "NonTrainableAgent"] diff --git a/trace_bench/examples/internal_numeric_param.py b/trace_bench/examples/internal_numeric_param.py new file mode 100644 index 0000000..22d1a21 --- /dev/null +++ b/trace_bench/examples/internal_numeric_param.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class NumericGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class NumericParamAgent: + def __init__(self): + self.value = trace.node(0.0, trainable=True) + + def __call__(self, _input): + return self.emit(self.value) + + @trace.bundle(trainable=True) + def emit(self, value): + return value + + +def build_trace_problem(**_override_eval_kwargs): + agent = NumericParamAgent() + guide = NumericGuide() + train_dataset = dict(inputs=[None], infos=[3.0]) + optimizer_kwargs = dict(objective="Match the numeric target value.", memory_size=5) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="NumericParamAgent"), + ) + + +__all__ = ["build_trace_problem", "NumericParamAgent"] diff --git a/trace_bench/examples/train_single_node_stub.py b/trace_bench/examples/train_single_node_stub.py new file mode 100644 index 0000000..e7f141f --- /dev/null +++ b/trace_bench/examples/train_single_node_stub.py @@ -0,0 +1,50 @@ +ο»Ώfrom __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class RegressionGuide(Guide): + def get_feedback(self, query, response, reference, **kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class SingleNodeAgent: + def __init__(self): + self.guess = trace.node(0.0, trainable=True) + + def __call__(self, _input): + return self.output(self.guess) + + @trace.bundle(trainable=True) + def output(self, guess): + return guess + + +def build_trace_problem(**override_eval_kwargs): + agent = SingleNodeAgent() + guide = RegressionGuide() + train_dataset = dict( + inputs=[None], + infos=[3.0], + ) + optimizer_kwargs = dict( + objective="Match the target scalar value.", + memory_size=5, + ) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + 
optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="example", entry="SingleNodeAgent"), + ) + + +__all__ = ["build_trace_problem", "SingleNodeAgent"] diff --git a/trace_bench/matrix.py b/trace_bench/matrix.py new file mode 100644 index 0000000..ea0f232 --- /dev/null +++ b/trace_bench/matrix.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional +import hashlib +import json +import subprocess + +from trace_bench.config import RunConfig, TaskConfig, TrainerConfig +from trace_bench.resolve import resolve_trainer_kwargs + + +def _git_sha() -> str: + try: + return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip() + except Exception: + return "unknown" + + +def _stable_hash(payload: Dict[str, Any], length: int = 8) -> str: + data = json.dumps(payload, sort_keys=True, default=str).encode("utf-8") + return hashlib.sha256(data).hexdigest()[:length] + + +def compute_run_id(config_snapshot: Dict[str, Any], git_sha: Optional[str] = None) -> str: + timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") + payload = {"config": config_snapshot, "git": git_sha or _git_sha()} + return f"{timestamp}-{_stable_hash(payload, 8)}" + + +def compute_job_id(task_id: str, trainer_id: str, resolved_kwargs: Dict[str, Any], seed: int) -> str: + payload = { + "task_id": task_id, + "trainer_id": trainer_id, + "resolved_kwargs": resolved_kwargs, + "seed": seed, + } + return _stable_hash(payload, 12) + + +def task_suite(task_id: str) -> str: + if ":" in task_id: + return task_id.split(":", 1)[0] + return "llm4ad" + + +def resolve_job_kwargs(task: TaskConfig, trainer: TrainerConfig, params: Dict[str, Any]) -> Dict[str, Any]: + return { + "trainer_kwargs": resolve_trainer_kwargs(params, trainer.id), + "optimizer": trainer.optimizer, + "optimizer_kwargs": dict(trainer.optimizer_kwargs or {}), + "guide": trainer.guide, + "guide_kwargs": dict(trainer.guide_kwargs or {}), + "logger": trainer.logger, + "logger_kwargs": dict(trainer.logger_kwargs or {}), + "eval_kwargs": dict(task.eval_kwargs or {}), + } + + +@dataclass +class JobSpec: + job_id: str + task: TaskConfig + trainer: TrainerConfig + seed: int + params: Dict[str, Any] + resolved_kwargs: Dict[str, Any] + + @property + def task_id(self) -> str: + return self.task.id + + @property + def trainer_id(self) -> str: + return self.trainer.id + + @property + def suite(self) -> str: + return task_suite(self.task_id) + + +def expand_matrix(config: RunConfig) -> List[JobSpec]: + jobs: List[JobSpec] = [] + for task in config.tasks: + for trainer in config.trainers: + variants = trainer.params_variants or [{}] + for params in variants: + for seed in config.seeds: + resolved = resolve_job_kwargs(task, trainer, params) + job_id = compute_job_id(task.id, trainer.id, resolved, seed) + jobs.append( + JobSpec( + job_id=job_id, + task=task, + trainer=trainer, + seed=seed, + params=params, + resolved_kwargs=resolved, + ) + ) + return jobs diff --git a/trace_bench/registry.py b/trace_bench/registry.py new file mode 100644 index 0000000..8096e17 --- /dev/null +++ b/trace_bench/registry.py @@ -0,0 +1,284 @@ +ο»Ώfrom __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Set +import ast +import importlib +import importlib.util +import inspect +import json +import pkgutil +import sys + + +@dataclass +class TaskSpec: + id: str + suite: str + module: str + + 
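+# available=False means the trainer class could not be imported and was only recovered by parsing its source file (see discover_trainers below).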
+@dataclass +class TrainerSpec: + id: str + source: str + available: bool + + +_INTERNAL_TASKS = { + "internal:code_param": "internal_code_param", + "internal:numeric_param": "internal_numeric_param", + "internal:multi_param": "internal_multi_param", + "internal:non_trainable": "internal_non_trainable", +} + +_TRAINER_ALIASES = { + "GEPAAlgorithmBase": "GEPA-Base", + "GEPAUCBSearch": "GEPA-UCB", + "GEPABeamPareto": "GEPA-Beam", +} + +_VERIBENCH_UNAVAILABLE = ( + "veribench_unavailable: entrypoint not available (install Veribench or provide task list)" +) +_VERIBENCH_PLACEHOLDER = "veribench:smoke_placeholder" + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[1] + + +def _ensure_sys_path(path: Path) -> None: + if path.exists(): + path_str = str(path) + if path_str not in sys.path: + sys.path.insert(0, path_str) + + +def ensure_opto_importable() -> None: + try: + import opto # noqa: F401 + return + except Exception: + pass + repo_root = _repo_root() + _ensure_sys_path(repo_root.parent / "OpenTrace") + + +def ensure_llm4ad_importable(tasks_root: Path) -> None: + _ensure_sys_path(_repo_root()) + _ensure_sys_path(tasks_root.parent) + # Provide llm4ad_loader alias for task imports + try: + module = importlib.import_module("LLM4AD.llm4ad_loader") + sys.modules.setdefault("llm4ad_loader", module) + except Exception: + pass + + +def _load_index(tasks_root: Path) -> List[Dict[str, Any]]: + index_path = tasks_root / "index.json" + if not index_path.exists(): + return [] + return json.loads(index_path.read_text(encoding="utf-8")) + + +def discover_llm4ad(tasks_root: Path) -> List[TaskSpec]: + specs: List[TaskSpec] = [] + index = _load_index(tasks_root) + if index: + for entry in index: + key = entry.get("key") + module = entry.get("module") or entry.get("wrapper") + if key and module: + specs.append(TaskSpec(id=f"llm4ad:{key}", suite="llm4ad", module=module)) + return specs + # fallback: directories + for path in tasks_root.iterdir(): + if path.is_dir(): + specs.append(TaskSpec(id=f"llm4ad:{path.name}", suite="llm4ad", module=path.name)) + return specs + + +def discover_trace_examples() -> List[TaskSpec]: + return [ + TaskSpec(id="trace_examples:greeting_stub", suite="trace_examples", module="greeting_stub"), + TaskSpec(id="trace_examples:train_single_node_stub", suite="trace_examples", module="train_single_node_stub"), + ] + + +def discover_internal() -> List[TaskSpec]: + return [ + TaskSpec(id=task_id, suite="internal", module=module) + for task_id, module in _INTERNAL_TASKS.items() + ] + +def discover_veribench() -> List[TaskSpec]: + # Always return a placeholder task so CLI/validate can skip with a reason. + if importlib.util.find_spec("veribench") is None: + return [TaskSpec(id=_VERIBENCH_PLACEHOLDER, suite="veribench", module="veribench_unavailable")] + # Entry point not wired yet; keep placeholder until a task list is provided. 
+ return [TaskSpec(id=_VERIBENCH_PLACEHOLDER, suite="veribench", module="veribench_unavailable")] + + +def _iter_module_names(package_name: str) -> Iterable[str]: + try: + package = importlib.import_module(package_name) + except Exception: + return [] + names: List[str] = [package.__name__] + if hasattr(package, "__path__"): + for module_info in pkgutil.walk_packages(package.__path__, package.__name__ + "."): + names.append(module_info.name) + return names + + +def _class_names_from_file(module_name: str) -> List[str]: + spec = importlib.util.find_spec(module_name) + if spec is None or not spec.origin or not spec.origin.endswith(".py"): + return [] + try: + source = Path(spec.origin).read_text(encoding="utf-8") + tree = ast.parse(source) + except Exception: + return [] + names: List[str] = [] + for node in tree.body: + if not isinstance(node, ast.ClassDef): + continue + base_names: List[str] = [] + for base in node.bases: + if isinstance(base, ast.Name): + base_names.append(base.id) + elif isinstance(base, ast.Attribute): + base_names.append(base.attr) + if any(name.endswith("Trainer") or name.endswith("Algorithm") for name in base_names): + if node.name in {"Trainer", "Algorithm"}: + continue + names.append(node.name) + return names + + +def discover_trainers() -> List[TrainerSpec]: + ensure_opto_importable() + from opto.trainer.algorithms.algorithm import Trainer as TrainerBase + + specs: Dict[str, TrainerSpec] = {} + module_names: List[str] = [] + module_names.extend(_iter_module_names("opto.trainer.algorithms")) + module_names.extend(_iter_module_names("opto.features")) + + for module_name in sorted(set(module_names)): + try: + module = importlib.import_module(module_name) + except Exception: + for class_name in _class_names_from_file(module_name): + trainer_id = _TRAINER_ALIASES.get(class_name, class_name) + if trainer_id not in specs: + specs[trainer_id] = TrainerSpec(id=trainer_id, source=module_name, available=False) + continue + + for _name, obj in vars(module).items(): + if not inspect.isclass(obj): + continue + if obj is TrainerBase: + continue + if not issubclass(obj, TrainerBase): + continue + trainer_id = _TRAINER_ALIASES.get(obj.__name__, obj.__name__) + specs[trainer_id] = TrainerSpec(id=trainer_id, source=obj.__module__, available=True) + return sorted(specs.values(), key=lambda spec: spec.id) + + +def _parse_bench(bench: Optional[str]) -> Set[str]: + if not bench: + return {"llm4ad", "trace_examples", "internal"} + normalized = bench.replace("+", ",") + parts = [p.strip() for p in normalized.split(",") if p.strip()] + if not parts: + return {"llm4ad", "trace_examples", "internal"} + allowed = {"llm4ad", "trace_examples", "internal", "veribench"} + unknown = [p for p in parts if p not in allowed] + if unknown: + raise ValueError(f"Unknown bench selector(s): {unknown}. 
Allowed: {sorted(allowed)}") + return set(parts) + + +def discover_tasks(tasks_root: str | Path, bench: Optional[str] = None) -> List[TaskSpec]: + root = Path(tasks_root) + selected = _parse_bench(bench) + specs: List[TaskSpec] = [] + if "llm4ad" in selected: + specs.extend(discover_llm4ad(root)) + if "trace_examples" in selected: + specs.extend(discover_trace_examples()) + if "internal" in selected: + specs.extend(discover_internal()) + if "veribench" in selected: + specs.extend(discover_veribench()) + return specs + + +def _normalize_task_id(task_id: str) -> str: + if task_id.startswith("example:"): + return task_id.replace("example:", "trace_examples:", 1) + if ":" in task_id: + return task_id + return f"llm4ad:{task_id}" + + +def load_task_module(task_id: str, tasks_root: str | Path): + ensure_opto_importable() + root = Path(tasks_root) + task_id = _normalize_task_id(task_id) + if task_id.startswith("trace_examples:"): + module_name = task_id.split(":", 1)[1] + return importlib.import_module(f"trace_bench.examples.{module_name}") + if task_id.startswith("internal:"): + module_name = _INTERNAL_TASKS.get(task_id, task_id.split(":", 1)[1]) + return importlib.import_module(f"trace_bench.examples.{module_name}") + if task_id.startswith("veribench:"): + raise NotImplementedError(_VERIBENCH_UNAVAILABLE) + + ensure_llm4ad_importable(root) + mapping = {spec.id.split(":", 1)[1]: spec.module for spec in discover_llm4ad(root)} + task_key = task_id.split(":", 1)[1] + module_dir = mapping.get(task_key, task_key) + module_path = root / module_dir / "__init__.py" + if not module_path.exists(): + raise FileNotFoundError(f"Task module not found: {module_path}") + + module_name = f"trace_bench_task_{module_dir}_{abs(hash(str(module_path)))}" + spec = importlib.util.spec_from_file_location(module_name, str(module_path)) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load spec for {module_path}") + mod = importlib.util.module_from_spec(spec) + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + return mod + + +def load_task_bundle(task_id: str, tasks_root: str | Path, eval_kwargs: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + task_id = _normalize_task_id(task_id) + if task_id.startswith("veribench:"): + raise NotImplementedError(_VERIBENCH_UNAVAILABLE) + mod = load_task_module(task_id, tasks_root) + if not hasattr(mod, "build_trace_problem"): + raise AttributeError(f"Task module {task_id} missing build_trace_problem") + bundle = mod.build_trace_problem(**(eval_kwargs or {})) + required = {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"} + missing = required - set(bundle.keys()) + if missing: + raise KeyError(f"Task bundle missing keys: {sorted(missing)}") + return bundle + + +__all__ = [ + "TaskSpec", + "TrainerSpec", + "discover_tasks", + "discover_trainers", + "discover_veribench", + "load_task_bundle", + "load_task_module", +] diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py new file mode 100644 index 0000000..e285341 --- /dev/null +++ b/trace_bench/resolve.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import Any, Dict, List + + +_FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs"} + + +def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: + if algo_name == "PrioritySearch": + return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) + if algo_name == "GEPA-Base": + return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + # GEPA-UCB 
and GEPA-Beam use num_search_iterations + return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + + +def _param_alias_map(algo_name: str) -> Dict[str, str]: + base = { + "threads": "num_threads", + "ps_steps": "num_steps", + "ps_batches": "num_batches", + "ps_candidates": "num_candidates", + "ps_proposals": "num_proposals", + "ps_mem_update": "memory_update_frequency", + "gepa_train_bs": "train_batch_size", + "gepa_merge_every": "merge_every", + "gepa_pareto_subset": "pareto_subset_size", + } + if algo_name == "GEPA-Base": + base["gepa_iters"] = "num_iters" + else: + base["gepa_iters"] = "num_search_iterations" + return base + + +def resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: + kwargs = _default_trainer_kwargs(algo_name) + alias_map = _param_alias_map(algo_name) + for key, value in (params or {}).items(): + if key in _FILTERED_KWARGS: + continue + mapped_key = alias_map.get(key, key) + kwargs[mapped_key] = value + return kwargs + + +def _clone(value: Any) -> Any: + if isinstance(value, dict): + return {k: _clone(v) for k, v in value.items()} + if isinstance(value, list): + return [_clone(v) for v in value] + return value + + +def merge_kwargs(base: Any, override: Any) -> Any: + if override is None: + return _clone(base) + if base is None: + return _clone(override) + if isinstance(base, dict) and isinstance(override, dict): + merged = dict(base) + merged.update(override) + return merged + if isinstance(base, list) and isinstance(override, dict): + if not base: + return [_clone(override)] + return [ + merge_kwargs(item, override) if isinstance(item, (dict, list)) else _clone(item) + for item in base + ] + if isinstance(base, dict) and isinstance(override, list): + if not override: + return _clone(base) + return [ + merge_kwargs(base, item) if isinstance(item, (dict, list)) else _clone(item) + for item in override + ] + if isinstance(base, list) and isinstance(override, list): + merged: List[Any] = [] + max_len = max(len(base), len(override)) + for idx in range(max_len): + left = base[idx] if idx < len(base) else None + right = override[idx] if idx < len(override) else None + if left is None: + merged.append(_clone(right)) + elif right is None: + merged.append(_clone(left)) + else: + merged.append(merge_kwargs(left, right)) + return merged + return _clone(override) + + +__all__ = ["resolve_trainer_kwargs", "merge_kwargs"] diff --git a/trace_bench/results.py b/trace_bench/results.py new file mode 100644 index 0000000..2e307c6 --- /dev/null +++ b/trace_bench/results.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from typing import Any, Dict, List +import json + +from trace_bench.artifacts import sanitize_for_json + + +RESULT_COLUMNS = [ + "run_id", + "job_id", + "task_id", + "suite", + "trainer_id", + "seed", + "status", + "score_initial", + "score_final", + "score_best", + "time_seconds", + "resolved_trainer_kwargs", + "resolved_optimizer_kwargs", + "eval_kwargs", + "feedback", + "tb_logdir", +] + + +def _json_cell(value: Any) -> str: + return json.dumps(sanitize_for_json(value), sort_keys=True, ensure_ascii=False) + + +def build_results_row( + run_id: str, + job_id: str, + task_id: str, + suite: str, + trainer_id: str, + seed: int, + status: str, + score_initial: Any, + score_final: Any, + score_best: Any, + time_seconds: float, + resolved_trainer_kwargs: Dict[str, Any], + resolved_optimizer_kwargs: Dict[str, Any], + eval_kwargs: Dict[str, Any], + feedback: str | None, + tb_logdir: str, +) -> Dict[str, 
Any]: + return { + "run_id": run_id, + "job_id": job_id, + "task_id": task_id, + "suite": suite, + "trainer_id": trainer_id, + "seed": seed, + "status": status, + "score_initial": score_initial, + "score_final": score_final, + "score_best": score_best, + "time_seconds": round(time_seconds, 6), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "eval_kwargs": eval_kwargs, + "feedback": feedback or "", + "tb_logdir": tb_logdir, + } + + +def build_results_csv_row(row: Dict[str, Any]) -> Dict[str, Any]: + csv_row = dict(row) + csv_row["resolved_trainer_kwargs"] = _json_cell(row.get("resolved_trainer_kwargs")) + csv_row["resolved_optimizer_kwargs"] = _json_cell(row.get("resolved_optimizer_kwargs")) + csv_row["eval_kwargs"] = _json_cell(row.get("eval_kwargs")) + return csv_row + + +def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: + counts: Dict[str, int] = {"ok": 0, "failed": 0, "skipped": 0} + for row in rows: + status = row.get("status") or "ok" + if status not in counts: + counts[status] = 0 + counts[status] += 1 + return {"counts": counts, "total_jobs": len(rows)} + + +__all__ = ["RESULT_COLUMNS", "build_results_row", "build_results_csv_row", "summarize_results"] diff --git a/trace_bench/runner.py b/trace_bench/runner.py new file mode 100644 index 0000000..4a8f879 --- /dev/null +++ b/trace_bench/runner.py @@ -0,0 +1,361 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +import json +import random +import time + +from trace_bench.artifacts import ( + RunArtifacts, + append_event, + append_results_csv, + init_job_dir, + init_run_dir, + write_config_snapshot, + write_env_json, + write_git_json, + write_manifest, + write_job_meta, + write_job_results, + write_summary, +) +from trace_bench.config import RunConfig, TaskConfig, TrainerConfig +from trace_bench.matrix import JobSpec, compute_run_id, expand_matrix +from trace_bench.registry import load_task_bundle +from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs +from trace_bench.results import RESULT_COLUMNS, build_results_csv_row, build_results_row, summarize_results + + +try: + from opto.trace.nodes import ParameterNode +except Exception: # pragma: no cover - only when opto is not available + ParameterNode = object # type: ignore + + +@dataclass +class RunSummary: + run_id: str + results: List[Dict[str, Any]] + + +def _extract_response(model: Any, input_value: Any) -> Any: + if isinstance(model, ParameterNode): + return getattr(model, "data", model) + if callable(model): + output = model(input_value) + return getattr(output, "data", output) + return getattr(model, "data", model) + + +def _evaluate_bundle(bundle: Dict[str, Any]) -> Dict[str, Any]: + dataset = bundle["train_dataset"] + guide = bundle["guide"] + inputs = dataset.get("inputs") or [] + infos = dataset.get("infos") or [] + if not inputs or not infos: + return {"score": None, "feedback": "empty_dataset"} + task_input = inputs[0] + task_info = infos[0] + response = _extract_response(bundle["param"], task_input) + try: + score, feedback = guide(task_input, response, task_info) + except Exception as exc: + return {"score": None, "feedback": f"eval_error: {exc}"} + return {"score": score, "feedback": feedback} + + +def _resolve_algorithm(name: str): + if name == "PrioritySearch": + return "PrioritySearch" + if name == "GEPA-Base": + from 
opto.features.gepa.gepa_algorithms import GEPAAlgorithmBase + return GEPAAlgorithmBase + if name == "GEPA-UCB": + from opto.features.gepa.gepa_algorithms import GEPAUCBSearch + return GEPAUCBSearch + if name == "GEPA-Beam": + from opto.features.gepa.gepa_algorithms import GEPABeamPareto + return GEPABeamPareto + return name + + +def _train_bundle(bundle: Dict[str, Any], trainer_spec: TrainerConfig, params: Dict[str, Any], mode: str) -> Dict[str, Any]: + from opto import trainer as opto_trainer + + algo_name = trainer_spec.id + algo = _resolve_algorithm(algo_name) + kwargs = resolve_trainer_kwargs(params, algo_name) + + optimizer = trainer_spec.optimizer + guide = trainer_spec.guide or bundle["guide"] + logger = trainer_spec.logger or "ConsoleLogger" + guide_kwargs = merge_kwargs(bundle.get("guide_kwargs"), trainer_spec.guide_kwargs or {}) + logger_kwargs = merge_kwargs(bundle.get("logger_kwargs"), trainer_spec.logger_kwargs or {}) + + optimizer_kwargs = merge_kwargs(bundle.get("optimizer_kwargs", {}), trainer_spec.optimizer_kwargs or {}) + + if mode == "stub": + try: + from opto.utils.llm import DummyLLM + + def _dummy_response(*_args, **_kwargs): + return '{"suggestion": {}}' + + dummy = DummyLLM(_dummy_response) + if isinstance(optimizer_kwargs, list): + for item in optimizer_kwargs: + item.setdefault("llm", dummy) + elif isinstance(optimizer_kwargs, dict): + optimizer_kwargs.setdefault("llm", dummy) + except Exception: + pass + + try: + opto_trainer.train( + model=bundle["param"], + train_dataset=bundle["train_dataset"], + algorithm=algo, + guide=guide, + optimizer=optimizer, + logger=logger, + optimizer_kwargs=optimizer_kwargs, + guide_kwargs=guide_kwargs, + logger_kwargs=logger_kwargs, + **kwargs, + ) + return {"status": "ok", "optimizer_kwargs": optimizer_kwargs, "trainer_kwargs": kwargs} + except Exception as exc: + return {"status": "failed", "error": str(exc), "optimizer_kwargs": optimizer_kwargs, "trainer_kwargs": kwargs} + + +def _has_trainables(model: Any) -> bool: + if isinstance(model, ParameterNode): + return bool(getattr(model, "trainable", True)) + if hasattr(model, "parameters"): + try: + params = model.parameters() + return any(getattr(p, "trainable", False) for p in params) + except Exception: + return True + return True + + +class BenchRunner: + def __init__(self, config: RunConfig, tasks_root: str | Path = "LLM4AD/benchmark_tasks"): + self.config = config + self.tasks_root = Path(tasks_root) + random.seed(self.config.seeds[0] if self.config.seeds else 123) + self.artifacts: Optional[RunArtifacts] = None + self._bundle_cache: Dict[str, Dict[str, Any]] = {} + + def _bundle_cache_key(self, task: TaskConfig) -> str: + eval_sig = json.dumps(task.eval_kwargs or {}, sort_keys=True) + return f"{task.id}|{eval_sig}" + + def _get_bundle(self, task: TaskConfig) -> Tuple[str, Optional[Dict[str, Any]], Optional[str]]: + key = self._bundle_cache_key(task) + if key in self._bundle_cache: + cached = self._bundle_cache[key] + return cached["status"], cached.get("bundle"), cached.get("error") + try: + bundle = load_task_bundle(task.id, self.tasks_root, eval_kwargs=task.eval_kwargs) + entry = {"status": "ok", "bundle": bundle, "error": None} + except NotImplementedError as exc: + entry = {"status": "skipped", "bundle": None, "error": str(exc)} + except Exception as exc: + entry = {"status": "failed", "bundle": None, "error": f"task_load_error: {exc}"} + self._bundle_cache[key] = entry + return entry["status"], entry.get("bundle"), entry.get("error") + + def run(self) -> RunSummary: + 
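+        # Expand the task x trainer x variant x seed matrix, execute each job, and write the run-level artifacts (config snapshot, env/git JSON, results.csv, manifest.json, summary.json).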
snapshot = self.config.snapshot() + run_id = self.config.run_id or compute_run_id({k: v for k, v in snapshot.items() if k != "run_id"}) + self.config.run_id = run_id + snapshot = self.config.snapshot() + + self.artifacts = init_run_dir(self.config.runs_dir, run_id) + write_config_snapshot(self.artifacts.config_snapshot, snapshot) + write_env_json(self.artifacts.env_json) + write_git_json(self.artifacts.git_json) + + jobs = expand_matrix(self.config) + + results: List[Dict[str, Any]] = [] + manifest_jobs: List[Dict[str, Any]] = [] + for job in jobs: + row, manifest_job = self._run_job(job) + results.append(row) + manifest_jobs.append(manifest_job) + if self.config.fail_fast and row.get("status") == "failed": + break + + recorded_job_ids = {entry["job_id"] for entry in manifest_jobs} + for job in jobs: + if job.job_id in recorded_job_ids: + continue + status_hint, bundle, skip_reason = self._get_bundle(job.task) + resolved_trainer_kwargs = resolve_trainer_kwargs(job.params, job.trainer_id) + resolved_optimizer_kwargs = merge_kwargs( + bundle.get("optimizer_kwargs", {}) if bundle else {}, + job.trainer.optimizer_kwargs or {}, + ) + resolved_guide_kwargs = merge_kwargs( + bundle.get("guide_kwargs") if bundle else {}, + job.trainer.guide_kwargs or {}, + ) + resolved_logger_kwargs = merge_kwargs( + bundle.get("logger_kwargs") if bundle else {}, + job.trainer.logger_kwargs or {}, + ) + manifest_jobs.append( + { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "raw_params": dict(job.params), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "eval_kwargs": dict(job.task.eval_kwargs or {}), + "status": "not_executed", + "status_hint": status_hint, + "skip_reason": skip_reason or "fail_fast_stopped", + } + ) + + manifest = { + "run_id": run_id, + "generated_at": datetime.utcnow().isoformat() + "Z", + "jobs": manifest_jobs, + } + write_manifest(self.artifacts.manifest_json, manifest) + + write_summary(self.artifacts.summary_json, summarize_results(results)) + return RunSummary(run_id=run_id, results=results) + + def _run_job(self, job: JobSpec) -> Tuple[Dict[str, Any], Dict[str, Any]]: + assert self.artifacts is not None + job_artifacts = init_job_dir(self.artifacts, job.job_id) + start_time = time.time() + status = "ok" + feedback: Optional[str] = None + + status_hint, bundle, bundle_error = self._get_bundle(job.task) + if status_hint != "ok": + status = status_hint + feedback = bundle_error + + score_initial = None + score_final = None + score_best = None + resolved_trainer_kwargs: Dict[str, Any] = resolve_trainer_kwargs(job.params, job.trainer_id) + resolved_optimizer_kwargs: Dict[str, Any] = dict(job.trainer.optimizer_kwargs or {}) + resolved_guide_kwargs = merge_kwargs({}, job.trainer.guide_kwargs) + resolved_logger_kwargs = merge_kwargs({}, job.trainer.logger_kwargs) + + if bundle is not None and status == "ok": + resolved_optimizer_kwargs = merge_kwargs( + bundle.get("optimizer_kwargs", {}), job.trainer.optimizer_kwargs or {} + ) + resolved_guide_kwargs = merge_kwargs( + bundle.get("guide_kwargs"), + job.trainer.guide_kwargs, + ) + resolved_logger_kwargs = merge_kwargs( + bundle.get("logger_kwargs"), + job.trainer.logger_kwargs, + ) + if not _has_trainables(bundle["param"]): + status = "failed" + feedback = "no_trainable_parameters" + else: + initial = 
_evaluate_bundle(bundle) + score_initial = initial.get("score") + train_result = _train_bundle(bundle, job.trainer, job.params, self.config.mode) + status = train_result.get("status", "ok") + resolved_optimizer_kwargs = train_result.get("optimizer_kwargs") or resolved_optimizer_kwargs + resolved_trainer_kwargs = train_result.get("trainer_kwargs") or resolved_trainer_kwargs + if status == "failed": + feedback = f"training_error: {train_result.get('error', 'unknown')}" + final = _evaluate_bundle(bundle) + score_final = final.get("score") + if status != "failed": + feedback = final.get("feedback") or feedback + + if isinstance(score_initial, (int, float)) and isinstance(score_final, (int, float)): + score_best = max(score_initial, score_final) + else: + score_best = score_final if score_final is not None else score_initial + + elapsed = time.time() - start_time + tb_rel = str(Path("jobs") / job.job_id / "tb") + row = build_results_row( + run_id=self.config.run_id or "", + job_id=job.job_id, + task_id=job.task_id, + suite=job.suite, + trainer_id=job.trainer_id, + seed=job.seed, + status=status, + score_initial=score_initial, + score_final=score_final, + score_best=score_best, + time_seconds=elapsed, + resolved_trainer_kwargs=resolved_trainer_kwargs, + resolved_optimizer_kwargs=resolved_optimizer_kwargs, + eval_kwargs=job.task.eval_kwargs, + feedback=feedback, + tb_logdir=tb_rel, + ) + job_meta = { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "status": status, + "raw_params": dict(job.params), + "params": job.params, + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "optimizer": job.trainer.optimizer, + "optimizer_kwargs": job.trainer.optimizer_kwargs, + "guide": job.trainer.guide, + "guide_kwargs": job.trainer.guide_kwargs, + "logger": job.trainer.logger, + "logger_kwargs": job.trainer.logger_kwargs, + "eval_kwargs": job.task.eval_kwargs, + "feedback": feedback or "", + "tb_logdir": tb_rel, + } + write_job_meta(job_artifacts.job_meta, job_meta) + append_results_csv(self.artifacts.results_csv, RESULT_COLUMNS, build_results_csv_row(row)) + append_event(job_artifacts.events_jsonl, row) + write_job_results(job_artifacts.results_json, row) + manifest_job = { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "raw_params": dict(job.params), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "eval_kwargs": dict(job.task.eval_kwargs or {}), + "status": status, + "feedback": feedback or "", + } + return row, manifest_job + + +__all__ = ["BenchRunner", "RunSummary"] diff --git a/trace_bench/tasks.py b/trace_bench/tasks.py new file mode 100644 index 0000000..4013d2f --- /dev/null +++ b/trace_bench/tasks.py @@ -0,0 +1,5 @@ +ο»Ώ"""Backward-compatible task helpers. 
Use trace_bench.registry instead.""" + +from .registry import discover_tasks, load_task_bundle, load_task_module, TaskSpec + +__all__ = ["discover_tasks", "load_task_bundle", "load_task_module", "TaskSpec"] diff --git a/trace_bench/ui.py b/trace_bench/ui.py new file mode 100644 index 0000000..f2090e6 --- /dev/null +++ b/trace_bench/ui.py @@ -0,0 +1,60 @@ +ο»Ώfrom __future__ import annotations + +from pathlib import Path +import csv +import json + + +def _read_text(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except Exception: + return "" + + +def _read_csv(path: Path): + if not path.exists(): + return [] + with path.open("r", encoding="utf-8") as f: + reader = csv.DictReader(f) + return list(reader) + + +def launch_ui(runs_dir: str) -> int: + try: + import gradio as gr + except Exception: + print("Gradio is not installed. Install with: pip install gradio") + return 1 + + runs_root = Path(runs_dir) + runs = sorted([p.name for p in runs_root.iterdir() if p.is_dir()]) if runs_root.exists() else [] + + def load_run(run_id: str): + run_path = runs_root / run_id + config_text = _read_text(run_path / "meta" / "config.snapshot.yaml") + results = _read_csv(run_path / "results.csv") + env_text = _read_text(run_path / "meta" / "env.json") + return config_text, results, env_text + + with gr.Blocks() as demo: + gr.Markdown("# Trace-Bench UI (Stub)") + gr.Markdown("Select a run to view config, results, and env info.") + run_selector = gr.Dropdown(choices=runs, label="Run ID") + config_box = gr.Code(label="config.snapshot.yaml", language="yaml") + results_df = gr.Dataframe(label="results.csv") + env_box = gr.Code(label="env.json", language="json") + + run_selector.change(load_run, inputs=run_selector, outputs=[config_box, results_df, env_box]) + + try: + import mlflow # noqa: F401 + gr.Markdown("MLflow detected. Full integration is pending (M3).") + except Exception: + gr.Markdown("MLflow not installed. Install if you want UI-linked runs.") + + demo.launch() + return 0 + + +__all__ = ["launch_ui"]
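
For a quick local check of the modules above, a minimal library-level sketch (assuming the package from this change is installed, e.g. `pip install -e .`, OpenTrace/`opto` is importable, and `configs/smoke.yaml` from this change is on disk):

```python
from trace_bench.config import load_config
from trace_bench.matrix import compute_job_id, expand_matrix
from trace_bench.resolve import resolve_trainer_kwargs
from trace_bench.runner import BenchRunner

# Short config knobs resolve onto trainer kwargs (PrioritySearch defaults filled in).
kwargs = resolve_trainer_kwargs({"ps_steps": 1, "ps_batches": 1}, "PrioritySearch")
# -> num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2

# Job IDs are deterministic hashes of (task, trainer, resolved kwargs, seed).
jid = compute_job_id("internal:numeric_param", "PrioritySearch", {"trainer_kwargs": kwargs}, 123)
assert jid == compute_job_id("internal:numeric_param", "PrioritySearch", {"trainer_kwargs": kwargs}, 123)

# Expand the task x trainer x variant x seed matrix and run it (stub mode, no API keys).
config = load_config("configs/smoke.yaml")
print([job.job_id for job in expand_matrix(config)])

summary = BenchRunner(config, tasks_root="LLM4AD/benchmark_tasks").run()
print(summary.run_id, [row["status"] for row in summary.results])
```

The CLI path (`trace-bench run --config configs/smoke.yaml`) should exercise the same runner and produce the artifact layout listed in the README.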