From 96dbf94036387226b7dc8d1f30e76ecc5f2f8942 Mon Sep 17 00:00:00 2001 From: Asad Date: Tue, 10 Feb 2026 11:05:44 +0500 Subject: [PATCH 1/8] M1: Runner API, canonical artifacts, CLI, and notebook Implements the M1 milestone for Trace-Bench: CLI surface: - trace-bench list-tasks, list-trainers, validate --config --strict, run, ui - Strict validation: trainer kwarg checking, optimizer/guide/logger resolution, trainable parameter detection, matrix expansion with manifest output Runner & training: - BenchRunner with deterministic SHA256-based job IDs - Algorithm-aware kwarg mapping (PrioritySearch vs GEPA-Base/UCB/Beam) - DummyLLM stub mode for offline testing - Training error capture in feedback field Canonical artifact layout: - meta/config.snapshot.yaml, manifest.json, env.json (redacted), git.json - Per-job: job_meta.json, results.json, events.jsonl, artifacts/, tb/ - Run-level: results.csv (16 columns) + summary.json Task coverage: - 4 internal types (code_param, numeric_param, multi_param, non_trainable) - trace_examples:greeting_stub - llm4ad:circle_packing (bounded timeout) - veribench:smoke_placeholder (NotImplementedError stub) Trainer coverage: - PrioritySearch + GEPA-Base exercised in real mode - GEPA-UCB + GEPA-Beam configured (M4 scope) Tests: 30 pass, 2 skipped (m0 smoke, m1 artifacts, matrix e2e, internal tasks, opentrace examples, trainer config, veribench CLI) Notebook: 01_m1_minimal_api.ipynb with Colab badge, auto-detect API key (real/stub mode), 2x2 matrix smoke (4/4 ok), executed outputs committed. 
--- .gitignore | 6 +- README.md | 68 +- configs/m1_matrix_smoke.yaml | 24 + configs/m1_validation.yaml | 55 + configs/smoke.yaml | 12 + configs/smoke_real.yaml | 12 + notebooks/01_m1_minimal_api.ipynb | 1544 +++++++++++++++++ notebooks/01_smoke_runner.ipynb | 213 +++ pytest.ini | 4 + setup.py | 14 +- tests/m0/test_config.py | 8 + tests/m0/test_runner_smoke.py | 38 + tests/m0/test_stub_llm.py | 25 + tests/m1/test_artifacts_layout.py | 28 + tests/m1/test_internal_tasks.py | 23 + tests/m1/test_matrix.py | 51 + tests/m1/test_opentrace_examples_smoke.py | 88 + tests/m1/test_trainer_config.py | 22 + tests/m1/test_veribench_cli.py | 17 + tests/test_lite_optimize_llm4ad.py | 3 + trace_bench/__init__.py | 6 + trace_bench/__main__.py | 4 + trace_bench/artifacts.py | 212 +++ trace_bench/cli.py | 231 +++ trace_bench/config.py | 228 +++ trace_bench/examples/__init__.py | 1 + trace_bench/examples/greeting_stub.py | 49 + trace_bench/examples/internal_code_param.py | 41 + trace_bench/examples/internal_multi_param.py | 45 + .../examples/internal_non_trainable.py | 41 + .../examples/internal_numeric_param.py | 44 + .../examples/train_single_node_stub.py | 50 + trace_bench/matrix.py | 101 ++ trace_bench/registry.py | 217 +++ trace_bench/results.py | 82 + trace_bench/runner.py | 334 ++++ trace_bench/tasks.py | 5 + trace_bench/ui.py | 60 + 38 files changed, 3999 insertions(+), 7 deletions(-) create mode 100644 configs/m1_matrix_smoke.yaml create mode 100644 configs/m1_validation.yaml create mode 100644 configs/smoke.yaml create mode 100644 configs/smoke_real.yaml create mode 100644 notebooks/01_m1_minimal_api.ipynb create mode 100644 notebooks/01_smoke_runner.ipynb create mode 100644 pytest.ini create mode 100644 tests/m0/test_config.py create mode 100644 tests/m0/test_runner_smoke.py create mode 100644 tests/m0/test_stub_llm.py create mode 100644 tests/m1/test_artifacts_layout.py create mode 100644 tests/m1/test_internal_tasks.py create mode 100644 tests/m1/test_matrix.py create mode 
100644 tests/m1/test_opentrace_examples_smoke.py create mode 100644 tests/m1/test_trainer_config.py create mode 100644 tests/m1/test_veribench_cli.py create mode 100644 trace_bench/__init__.py create mode 100644 trace_bench/__main__.py create mode 100644 trace_bench/artifacts.py create mode 100644 trace_bench/cli.py create mode 100644 trace_bench/config.py create mode 100644 trace_bench/examples/__init__.py create mode 100644 trace_bench/examples/greeting_stub.py create mode 100644 trace_bench/examples/internal_code_param.py create mode 100644 trace_bench/examples/internal_multi_param.py create mode 100644 trace_bench/examples/internal_non_trainable.py create mode 100644 trace_bench/examples/internal_numeric_param.py create mode 100644 trace_bench/examples/train_single_node_stub.py create mode 100644 trace_bench/matrix.py create mode 100644 trace_bench/registry.py create mode 100644 trace_bench/results.py create mode 100644 trace_bench/runner.py create mode 100644 trace_bench/tasks.py create mode 100644 trace_bench/ui.py diff --git a/.gitignore b/.gitignore index 83e4e75..074e707 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,8 @@ __pycache__/ external/* **/uv.lock *.egg-info/ -**/.venv/ \ No newline at end of file +**/.venv/ +.env +runs/ +runs_test/ +notebooks/01_smoke_runner_with_output.ipynb diff --git a/README.md b/README.md index 3423365..c49779f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,70 @@ Currently, we are adding problems/domains one folder at a time. The instructions to run each task are located inside the task folder. 
+## Quick Start (Runner/CLI) + +```bash +# M1 review checklist (recommended order) +# 1) List tasks (LLM4AD + example stubs) +trace-bench list-tasks --root LLM4AD/benchmark_tasks + +# 2) Validate a config +trace-bench validate --config configs/smoke.yaml + +# 3) Run Stub smoke (deterministic, no keys) +trace-bench run --config configs/smoke.yaml --runs-dir runs + +# 4) Run Real smoke (requires OPENAI_API_KEY) +trace-bench run --config configs/smoke_real.yaml --runs-dir runs + +# 5) Run tests (disable external plugin autoload) +PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q + +# List tasks (LLM4AD + example stubs) +trace-bench list-tasks --root LLM4AD/benchmark_tasks + +# Validate a config +trace-bench validate --config configs/smoke.yaml + +# Run a smoke benchmark +trace-bench run --config configs/smoke.yaml + +# Launch UI (stub) +trace-bench ui --runs-dir runs +``` + +Expected run artifacts: +- `runs//config.snapshot.yaml` +- `runs//env.json` +- `runs//results.csv` +- `runs//events.jsonl` +- `runs//summary.json` +- `runs//tb/` + +## M1 Dependencies (Required for Full Pass) + +System: +- Graphviz (system package) + +Python: +- `graphviz`, `pyyaml`, `pytest`, `numpy`, `matplotlib`, `litellm==1.75.0` + +OpenTrace examples strict smoke (for 100% pass): +- `datasets`, `textgrad`, `dspy`, `autogen`, `python-dotenv` + +## OpenTrace Examples Smoke (100% Pass Mode) + +To enforce 100% example smoke in CI, run: +```bash +TRACE_BENCH_STRICT_EXAMPLES=1 PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q +``` +Without strict mode, the smoke test skips only when optional deps are missing. + +## VeriBench Status (In Scope, Pending Input) + +VeriBench is in scope but requires the Trace team to provide the task entrypoint/task list. +CLI flags are ready (`--bench veribench`), and will raise a clear `NotImplementedError` until the entrypoint is provided. + ## Problem Sets ### General Problem Sets @@ -27,9 +91,9 @@ Current implementation of graph is a single node. 
**Supported Algorithms:** PrioritySearch, GEPA-Base, GEPA-UCB, GEPA-Beam -📖 **[See detailed usage guide →](LM4AD/readme.md)** +**See detailed usage guide:** `LLM4AD/readme.md` ## Agent Architecture - ReAct agent -All the libraries from other repos are stored and managed in the `external` folder -- this folder will be created if one of the `install.sh` script is run inside the task folder. \ No newline at end of file +All the libraries from other repos are stored and managed in the `external` folder -- this folder will be created if one of the `install.sh` script is run inside the task folder. diff --git a/configs/m1_matrix_smoke.yaml b/configs/m1_matrix_smoke.yaml new file mode 100644 index 0000000..3ba1b6e --- /dev/null +++ b/configs/m1_matrix_smoke.yaml @@ -0,0 +1,24 @@ +runs_dir: runs +mode: stub +seeds: [123] +max_workers: 1 +fail_fast: false + +tasks: + - id: internal:numeric_param + - id: llm4ad:circle_packing + eval_kwargs: + timeout_seconds: 10 + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 + + - id: GEPA-Base + params_variants: + - gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 diff --git a/configs/m1_validation.yaml b/configs/m1_validation.yaml new file mode 100644 index 0000000..fdbe511 --- /dev/null +++ b/configs/m1_validation.yaml @@ -0,0 +1,55 @@ +runs_dir: runs +mode: stub +seeds: [123] +max_workers: 1 +fail_fast: false + +tasks: + - id: internal:code_param + - id: internal:numeric_param + - id: internal:multi_param + - id: internal:non_trainable + - id: trace_examples:greeting_stub + - id: llm4ad:circle_packing + eval_kwargs: + timeout_seconds: 10 + - id: veribench:smoke_placeholder + +trainers: + - id: PrioritySearch + params_variants: + - threads: 2 + ps_steps: 1 + ps_batches: 1 + ps_candidates: 2 + ps_proposals: 2 + ps_mem_update: 1 + + - id: GEPA-Base + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + 
optimizer: OPROv2 + optimizer_kwargs: {} + + - id: GEPA-UCB + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + + - id: GEPA-Beam + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + +eval_kwargs: + timeout_seconds: 10 diff --git a/configs/smoke.yaml b/configs/smoke.yaml new file mode 100644 index 0000000..8455c9f --- /dev/null +++ b/configs/smoke.yaml @@ -0,0 +1,12 @@ +runs_dir: runs +mode: stub +seeds: [123] + +tasks: + - id: internal:numeric_param + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 diff --git a/configs/smoke_real.yaml b/configs/smoke_real.yaml new file mode 100644 index 0000000..2ebb27d --- /dev/null +++ b/configs/smoke_real.yaml @@ -0,0 +1,12 @@ +runs_dir: runs +mode: real +seeds: [123] + +tasks: + - id: trace_examples:greeting_stub + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb new file mode 100644 index 0000000..888d492 --- /dev/null +++ b/notebooks/01_m1_minimal_api.ipynb @@ -0,0 +1,1544 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "euYNX4m-m0Ty" + }, + "source": [ + "# Trace-Bench M1 — Minimal API Validation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/runner-foundation/notebooks/01_m1_minimal_api.ipynb)\n", + "\n", + "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", + "\n", + "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." 
+ ], + "id": "euYNX4m-m0Ty" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5DVjcAAm0UH" + }, + "source": [ + "## Expected Outputs\n", + "\n", + "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", + "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", + "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", + "- Internal non-trainable job shows `status=failed` with reason.\n", + "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", + "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." + ], + "id": "u5DVjcAAm0UH" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "8D3DGyVXm0UJ", + "outputId": "aadad0ba-037c-4ffc-8d5a-4c55fb9d0d3f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "Runs dir: /content/drive/MyDrive/bench/2026-02-09/trace_bench\n", + "API key found — running in REAL mode (model: gpt-4o-mini)\n", + "\n", + "Mode: real\n" + ] + } + ], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive_root = Path(\"/content/drive/MyDrive\")\n", + " root = drive_root if drive_root.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)\n", + "\n", + "# --- Auto-detect API key (real 
mode by default) ---\n", + "API_KEY = os.environ.get(\"OPENAI_API_KEY\", \"\")\n", + "if not API_KEY:\n", + " try:\n", + " from google.colab import userdata\n", + " API_KEY = userdata.get(\"OPENAI_API_KEY\") or \"\"\n", + " except Exception:\n", + " pass\n", + "\n", + "if API_KEY:\n", + " os.environ[\"OPENAI_API_KEY\"] = API_KEY\n", + " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = \"gpt-4o-mini\"\n", + " MODE = \"real\"\n", + " print(f\"API key found — running in REAL mode (model: gpt-4o-mini)\")\n", + "else:\n", + " MODE = \"stub\"\n", + " print(\"WARNING: No OPENAI_API_KEY found. Falling back to STUB mode.\")\n", + " print(\" All outputs below are labeled STUB — not real LLM results.\")\n", + "\n", + "os.environ[\"TB_MODE\"] = MODE\n", + "print(f\"\\nMode: {MODE}\")" + ], + "id": "8D3DGyVXm0UJ" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "swOi3Bhtm0UQ", + "outputId": "e9806308-35f8-48c5-e6b2-e5f46530a497", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Trace-Bench'...\n", + "remote: Enumerating objects: 315, done.\u001b[K\n", + "remote: Counting objects: 100% (315/315), done.\u001b[K\n", + "remote: Compressing objects: 100% (222/222), done.\u001b[K\n", + "remote: Total 315 (delta 42), reused 274 (delta 36), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (315/315), 3.86 MiB | 8.12 MiB/s, done.\n", + "Resolving deltas: 100% (42/42), done.\n", + "Cloning into 'OpenTrace'...\n", + "remote: Enumerating objects: 228, done.\u001b[K\n", + "remote: Counting objects: 100% (228/228), done.\u001b[K\n", + "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", + "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (228/228), 4.73 MiB | 14.77 MiB/s, done.\n", + "Resolving deltas: 100% (17/17), 
done.\n", + "/content/Trace-Bench\n", + "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:2 https://cli.github.com/packages stable InRelease [3,917 B]\n", + "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:4 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", + "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n", + "Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", + "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", + "Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,893 kB]\n", + "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", + "Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", + "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,661 kB]\n", + "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", + "Get:17 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,716 kB]\n", + "Get:18 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", + "Get:19 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", + "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", + "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", + "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,035 
kB]\n", + "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", + "Fetched 37.1 MB in 6s (6,435 kB/s)\n", + "Reading package lists... Done\n", + "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 55 not upgraded.\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", + "Collecting pip\n", + " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 24.1.2\n", + " Uninstalling pip-24.1.2:\n", + " Successfully uninstalled pip-24.1.2\n", + "Successfully installed pip-26.0.1\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", + "Collecting litellm==1.75.0\n", + " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", + "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) 
(3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.16.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", + "Requirement already satisfied: 
rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", + "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already 
satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) 
(1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.3.7)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from 
huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: litellm\n", + "Successfully installed litellm-1.75.0\n" + ] + } + ], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ], + "id": "swOi3Bhtm0UQ" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "a__iRJTHm0UR", + "outputId": "f48aba86-b779-4537-f5ce-8d5b2bdc4154", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== List trainers ===\n", + "PrioritySearch\tavailable\n", + "GEPA-Base\tavailable\n", + "GEPA-UCB\tavailable\n", + "GEPA-Beam\tavailable\n", + "\n", + "=== Validate config (strict) ===\n", + "[OK] internal:code_param\n", + "[OK] internal:numeric_param\n", + "[OK] internal:multi_param\n", + "[OK] internal:non_trainable\n", + "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", + "[OK] trace_examples:greeting_stub\n", + "[OK] llm4ad:circle_packing\n", + "[SKIP] veribench:smoke_placeholder: VeriBench tasks not yet wired: awaiting Trace team entrypoint/task 
list.\n", + "\n", + "[OK] matrix: 28 jobs expanded deterministically\n", + " job 6f3619dd9ae0: internal:code_param x PrioritySearch (seed=123)\n", + " job c486ba93400f: internal:code_param x GEPA-Base (seed=123)\n", + " job a84d2486d31a: internal:code_param x GEPA-UCB (seed=123)\n", + " job 8ecff95cfafa: internal:code_param x GEPA-Beam (seed=123)\n", + " job 778da61d2682: internal:numeric_param x PrioritySearch (seed=123)\n", + " job 4b3a7f322126: internal:numeric_param x GEPA-Base (seed=123)\n", + " job 4b9c7d66d866: internal:numeric_param x GEPA-UCB (seed=123)\n", + " job 54df742bb5e9: internal:numeric_param x GEPA-Beam (seed=123)\n", + " job 0bfef35f6ef3: internal:multi_param x PrioritySearch (seed=123)\n", + " job e06adbe6489b: internal:multi_param x GEPA-Base (seed=123)\n", + " job 8669d9b963d4: internal:multi_param x GEPA-UCB (seed=123)\n", + " job 90d23f88baf7: internal:multi_param x GEPA-Beam (seed=123)\n", + " job d6aa82e5d119: internal:non_trainable x PrioritySearch (seed=123)\n", + " job 4f655637a6dc: internal:non_trainable x GEPA-Base (seed=123)\n", + " job 85940a1b71e7: internal:non_trainable x GEPA-UCB (seed=123)\n", + " job dafcec9c13af: internal:non_trainable x GEPA-Beam (seed=123)\n", + " job e8e9938a4ef6: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", + " job 4715e211f8a9: trace_examples:greeting_stub x GEPA-Base (seed=123)\n", + " job 8c4ec9f3e355: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", + " job 2f84751a35ad: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", + " job da0e8ae694f1: llm4ad:circle_packing x PrioritySearch (seed=123)\n", + " job 0865599891de: llm4ad:circle_packing x GEPA-Base (seed=123)\n", + " job d25dcdb59892: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", + " job d985faad90f4: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", + " job 364d89b28934: veribench:smoke_placeholder x PrioritySearch (seed=123)\n", + " job 721282ed015b: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", + " job 
5b657b995d7a: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", + " job 77b3e4cb5bf0: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", + "\n", + " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", + " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 'PrioritySearch']\n", + "[OK] manifest written: runs/20260209-153344-8f7a72b4/meta/manifest.json\n", + "\n", + "=== Generate M1 run config (mode=real) ===\n", + "Config mode: real\n", + "\n", + "=== Run M1 validation ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:3: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:3: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:3: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.375582371483138\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.375582371483138\n", + "[Step 1] Update/best_candidate_mean_score: 1.375582371483138\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: -1000000.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:3: import numpy as np\n", + "import random\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " random.seed(2025)\n", + " np.random.seed(2025)\n", + "\n", + " circles = []\n", + " radii = np.random.uniform(0.01, 0.1, size=n) # Random radii between 0.01 and 0.1\n", + "\n", + " for _ in range(n):\n", + " placed = False\n", + " while not placed:\n", + " radius = np.random.choice(radii)\n", + " x 
= np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " overlap = False\n", + " \n", + " # Check for overlap\n", + " for circle in circles:\n", + " if np.sqrt((circle[0] - x) ** 2 + (circle[1] - y) ** 2) < (circle[2] + radius):\n", + " overlap = True\n", + " break\n", + " \n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " placed = True\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml <\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260209-153346-0daa4bb96f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.010.507114{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260209-153346-0daa4bb9c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.01.279633{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260209-153346-0daa4bb9778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.04.215786{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260209-153346-0daa4bb94b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.03.031100{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260209-153346-0daa4bb90bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.03.620341{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 12,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260209-153346-0daa4bb9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"364d89b28934\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 462909.5869786947,\n \"min\": -1000000.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": {\n 
\"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.86582048810776,\n \"min\": 3.5e-05,\n \"max\": 28.849823,\n \"num_unique_values\": 12,\n \"samples\": [\n 4.2e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the numeric target value.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"jobs/364d89b28934/tb\"\n ],\n \"semantic_type\": 
\"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gpkb4-1Em0UW" + }, + "source": [ + "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", + "\n", + "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." 
+ ], + "id": "gpkb4-1Em0UW" + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dMn7PDVgm0UX", + "outputId": "c37fef05-49b8-4180-dbc9-4b32fd20d45c", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== 2x2 Matrix Smoke (mode=real) ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.6499617928349034\n", + "[Step 1] \u001b[94mAlgo/Average train score: -749999.8375095518\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.6499617928349034\n", + "[Step 1] Update/best_candidate_mean_score: 0.6499617928349034\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: -499999.67501910357\n", + "[Step 1] Update/exploration_candidates_mean_score: -499999.67501910357\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: -499999.67501910357\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + "\n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + "\n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " np.random.seed(2025)\n", + " \n", + " circles = []\n", + " for _ in range(n):\n", + " radius = np.random.rand() * 0.05 # Variable radius, capped to keep circles small\n", + " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Ensures circles fit in unit square\n", + "\n", + " # Check 
for overlapping\n", + " while any(np.linalg.norm([x - circle[0], y - circle[1]]) < (radius + circle[2]) for circle in circles):\n", + " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Reposition if overlap detected\n", + "\n", + " circles.append([x, y, radius])\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.4689943904012859\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml <\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.000000
1internal:numeric_paraminternalGEPA-Base123ok-0.000000
2llm4ad:circle_packingllm4adPrioritySearch123ok0.649962
3llm4ad:circle_packingllm4adGEPA-Base123ok1.468994
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.697113339555075,\n \"min\": -0.0,\n \"max\": 1.468994390401286,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + 
"\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + ], + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/01_smoke_runner.ipynb b/notebooks/01_smoke_runner.ipynb new file mode 100644 index 0000000..283fb83 --- /dev/null +++ b/notebooks/01_smoke_runner.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trace-Bench Smoke Runner (Stub + Real)\n", + "\n", + "This notebook validates Trace-Bench in two modes:\n", + "\n", + "- **StubLLM**: deterministic, no API keys\n", + "- **Real LLM**: requires a user-provided API key (via Colab Secrets)\n", + "\n", + "It also shows the standardized run artifacts produced by the CLI." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Expected Outputs (Quick Verification)\n", + "\n", + "You should see the following signals if the notebook is working correctly:\n", + "\n", + "- **Stub smoke run** completes with a new `runs//` folder.\n", + "- `config.snapshot.yaml`, `env.json`, `results.csv`, `events.jsonl` exist in that folder.\n", + "- `results.csv` shows at least one row with `task=example:greeting_stub` and `status=trained`.\n", + "- **Real-LLM smoke** completes (if API key is set) and `results.csv` shows `status=trained`.\n", + "- `pytest -q` ends with `passed` (LLM4AD optimizer tests run only when `OPENAI_API_KEY` is set)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive = Path(\"/content/drive/MyDrive\")\n", + " root = drive if drive.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get 
update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: list tasks (external bench discovery)\n", + "!python -m trace_bench list-tasks --root LLM4AD/benchmark_tasks | head -n 30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Stub smoke (internal example task for deterministic output)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect latest run artifacts\n", + "import glob, json, pathlib, pandas as pd\n", + "\n", + "latest = sorted(glob.glob(f\"{RUNS_DIR}/*\"))[-1]\n", + "p = pathlib.Path(latest)\n", + "print(p)\n", + "\n", + "print((p / \"config.snapshot.yaml\").read_text()[:400])\n", + "print(json.loads((p / \"env.json\").read_text()).keys())\n", + "\n", + "pd.read_csv(p / \"results.csv\").head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Optional: external LLM4AD smoke (may yield low score if template fails)\n", + "cat > configs/smoke_llm4ad.yaml <<'YAML'\n", + "runs_dir: runs\n", + "mode: stub\n", + "seed: 123\n", + "tasks:\n", + " - circle_packing\n", + "trainers:\n", + " - PrioritySearch\n", + "YAML\n", + "\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke_llm4ad.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real LLM (requires API key)\n", + "\n", + "Add `OPENAI_API_KEY` 
in **Colab Secrets** and run the cells below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load API key from Colab Secrets\n", + "from google.colab import userdata\n", + "import os\n", + "\n", + "key = userdata.get(\"OPENAI_API_KEY\")\n", + "if not key:\n", + " raise RuntimeError(\"Missing OPENAI_API_KEY secret in Colab\")\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = key\n", + "os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + "os.environ[\"TRACE_LITELLM_MODEL\"] = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Real-LLM smoke (internal example task)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke_real.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Pytest (LLM4AD optimizer test runs only if OPENAI_API_KEY is set)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m pytest -q" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..be74aa6 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +pythonpath = . 
def test_load_config_smoke():
    """The checked-in smoke config must parse to the expected stub defaults."""
    parsed = load_config("configs/smoke.yaml")
    assert parsed.mode == "stub"
    assert parsed.tasks[0].id == "internal:numeric_param"
    assert parsed.runs_dir == "runs"
def test_example_tasks_load():
    """Both stub example tasks must expose the canonical bundle keys."""
    required = {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"}

    def _load(task_id):
        # Optional-dependency failures become skips; anything else re-raises.
        try:
            return load_task_bundle(task_id, "LLM4AD/benchmark_tasks")
        except Exception as exc:
            _skip_if_missing_deps(exc)
            raise

    for task_id in ("trace_examples:greeting_stub", "trace_examples:train_single_node_stub"):
        bundle = _load(task_id)
        assert required.issubset(bundle.keys())
def test_internal_non_trainable_fails(tmp_path):
    """A task with no trainable parameters must surface as a failed job."""
    payload = {
        "tasks": [{"id": "internal:non_trainable"}],
        "trainers": [{"id": "PrioritySearch", "params_variants": [{"ps_steps": 1}]}],
        "seeds": [123],
    }
    cfg = RunConfig.from_dict(payload)
    cfg.runs_dir = str(tmp_path / "runs")

    summary = BenchRunner(cfg).run()

    statuses = [row.get("status") for row in summary.results]
    assert "failed" in statuses
def test_matrix_smoke_e2e(tmp_path):
    """Run 2 tasks x 2 trainers x 1 seed = 4 jobs end-to-end and verify results."""
    cfg = load_config("configs/m1_matrix_smoke.yaml")
    cfg.runs_dir = str(tmp_path / "runs")
    cfg.mode = "stub"

    summary = BenchRunner(cfg).run()
    run_dir = Path(cfg.runs_dir) / summary.run_id

    # results.csv must contain exactly one data row per expanded job.
    csv_path = run_dir / "results.csv"
    assert csv_path.exists()
    with csv_path.open() as handle:
        data_rows = list(csv.DictReader(handle))
    assert len(data_rows) == 4, f"Expected 4 rows in results.csv, got {len(data_rows)}"

    # summary.json must aggregate the same job count.
    summary_path = run_dir / "summary.json"
    assert summary_path.exists()
    aggregated = json.loads(summary_path.read_text())
    assert aggregated["total_jobs"] == 4
repo_root.parent / "OpenTrace" + + +def _example_files() -> list[Path]: + root = _open_trace_root() / "examples" + if not root.exists(): + pytest.skip("OpenTrace examples directory not found") + return sorted([p for p in root.rglob("*.py") if p.is_file()]) + + +def _is_argparse_script(path: Path) -> bool: + try: + text = path.read_text(encoding="utf-8") + except Exception: + return False + return "argparse" in text or "ArgumentParser(" in text + + +def _extract_missing_module(output: str) -> str | None: + match = re.search(r"No module named ['\"]([^'\"]+)['\"]", output) + if match: + return match.group(1) + return None + + +def _run_smoke(path: Path): + env = dict(os.environ) + env["PYTHONPATH"] = str(_open_trace_root()) + + env["TRACE_BENCH_SMOKE"] = "1" + + if _is_argparse_script(path): + cmd = [sys.executable, str(path), "--help"] + else: + cmd = [ + sys.executable, + "-c", + f"import runpy; runpy.run_path(r'{path.as_posix()}', run_name='__not_main__')", + ] + + try: + proc = subprocess.run( + cmd, + env=env, + capture_output=True, + text=True, + cwd=str(path.parent), + timeout=30, + ) + return proc + except subprocess.TimeoutExpired: + raise AssertionError(f"Smoke timed out for {path}") + + +@pytest.mark.parametrize("path", _example_files()) +def test_opentrace_examples_smoke(path: Path): + strict = os.environ.get("TRACE_BENCH_STRICT_EXAMPLES") == "1" + proc = _run_smoke(path) + if proc.returncode == 0: + return + + output = (proc.stdout or "") + "\n" + (proc.stderr or "") + missing = _extract_missing_module(output) + if missing and missing in EXAMPLE_ALLOWLIST and not strict: + pytest.skip(f"Optional dependency missing for {path.name}: {missing}") + + raise AssertionError(f"Smoke failed for {path}:\n{output}") diff --git a/tests/m1/test_trainer_config.py b/tests/m1/test_trainer_config.py new file mode 100644 index 0000000..f766c74 --- /dev/null +++ b/tests/m1/test_trainer_config.py @@ -0,0 +1,22 @@ +import pytest + +from trace_bench.config import RunConfig + + 
def test_trainer_params_variants_parsed():
    """Explicit params_variants entries must survive config normalization."""
    payload = {
        "trainers": [
            {"id": "PrioritySearch", "params_variants": [{"ps_steps": 2}]}
        ]
    }
    cfg = RunConfig.from_dict(payload)
    assert cfg.trainers[0].params_variants[0]["ps_steps"] == 2
@dataclass
class RunArtifacts:
    """Canonical file layout for one benchmark run.

    ``meta/`` holds provenance files (config snapshot, env capture, git
    info, job manifest) while the run root holds the aggregated
    ``results.csv`` and ``summary.json``.
    """

    run_dir: Path   # root directory of the run
    meta_dir: Path  # <run_dir>/meta
    jobs_dir: Path  # <run_dir>/jobs

    @property
    def config_snapshot(self) -> Path:
        """Snapshot of the resolved run configuration."""
        return self.meta_dir.joinpath("config.snapshot.yaml")

    @property
    def env_json(self) -> Path:
        """Redacted environment capture."""
        return self.meta_dir.joinpath("env.json")

    @property
    def git_json(self) -> Path:
        """Repository commit/branch provenance."""
        return self.meta_dir.joinpath("git.json")

    @property
    def manifest_json(self) -> Path:
        """Expanded job-matrix manifest."""
        return self.meta_dir.joinpath("manifest.json")

    @property
    def results_csv(self) -> Path:
        """Aggregated per-job results table."""
        return self.run_dir.joinpath("results.csv")

    @property
    def summary_json(self) -> Path:
        """Run-level aggregate summary."""
        return self.run_dir.joinpath("summary.json")
run_path / "meta" + jobs_dir = run_path / "jobs" + meta_dir.mkdir(parents=True, exist_ok=True) + jobs_dir.mkdir(parents=True, exist_ok=True) + return RunArtifacts(run_dir=run_path, meta_dir=meta_dir, jobs_dir=jobs_dir) + + +def init_job_dir(run_artifacts: RunArtifacts, job_id: str) -> JobArtifacts: + job_dir = run_artifacts.jobs_dir / job_id + job_dir.mkdir(parents=True, exist_ok=True) + (job_dir / "artifacts").mkdir(parents=True, exist_ok=True) + (job_dir / "tb").mkdir(parents=True, exist_ok=True) + return JobArtifacts(job_dir=job_dir) + + +def _dump_yaml_or_json(data: Dict[str, Any]) -> str: + try: + import yaml # type: ignore + return yaml.safe_dump(data, sort_keys=False) + except Exception: + return json.dumps(data, indent=2, sort_keys=False) + + +def write_config_snapshot(path: Path, data: Dict[str, Any]) -> None: + path.write_text(_dump_yaml_or_json(data), encoding="utf-8") + + +def _git_info() -> Dict[str, Any]: + info: Dict[str, Any] = {} + try: + root = Path(__file__).resolve().parents[1] + info["repo_root"] = str(root) + info["commit"] = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=root).decode().strip() + info["branch"] = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=root).decode().strip() + return info + except Exception: + return info + + +_ENV_ALLOWLIST = { + "TRACE_DEFAULT_LLM_BACKEND", + "TRACE_LITELLM_MODEL", + "TRACE_CUSTOMLLM_MODEL", + "TRACE_CUSTOMLLM_URL", + "CUDA_VISIBLE_DEVICES", + "PYTHONPATH", +} + +_ENV_PREFIX_ALLOWLIST = ( + "TRACE_", + "OPENAI_", + "ANTHROPIC_", + "AZURE_", + "HF_", + "HUGGINGFACE_", +) + +_SENSITIVE_TOKENS = ("KEY", "TOKEN", "SECRET", "PASSWORD") + + +def _is_allowed_env_key(key: str) -> bool: + if key in _ENV_ALLOWLIST: + return True + return any(key.startswith(prefix) for prefix in _ENV_PREFIX_ALLOWLIST) + + +def _redact_env_value(key: str, value: str) -> str: + if any(token in key.upper() for token in _SENSITIVE_TOKENS): + return "***REDACTED***" + return value + + +def 
def append_results_csv(path: Path, fieldnames: List[str], row: Dict[str, Any]) -> None:
    """Append *row* to the CSV at *path*, emitting the header on first write.

    The header is written only when the file does not yet exist, so repeated
    calls accumulate one header line plus one data line per call.
    """
    needs_header = not path.exists()
    with path.open("a", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
mode 100644 index 0000000..694af6b --- /dev/null +++ b/trace_bench/cli.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import argparse +from pathlib import Path +import sys + +from trace_bench.config import load_config +from trace_bench.matrix import compute_run_id, expand_matrix +from trace_bench.registry import discover_tasks, discover_trainers, load_task_bundle +from trace_bench.runner import BenchRunner, _has_trainables +from trace_bench.artifacts import init_run_dir, write_manifest +from trace_bench.ui import launch_ui + + +def cmd_list_tasks(root: str, bench: str | None = None) -> int: + specs = discover_tasks(root, bench=bench) + for spec in specs: + print(spec.id) + return 0 + + +def cmd_list_trainers() -> int: + specs = discover_trainers() + for spec in specs: + status = "available" if spec.available else "unavailable" + print(f"{spec.id}\t{status}") + return 0 + + +def _task_in_bench(task_key: str, bench: str | None) -> bool: + if not bench: + return True + if ":" not in task_key: + task_key = f"llm4ad:{task_key}" + if "veribench" in bench and task_key.startswith("veribench:"): + return True + if "trace_examples" in bench and task_key.startswith("trace_examples:"): + return True + if "internal" in bench and task_key.startswith("internal:"): + return True + if "llm4ad" in bench and task_key.startswith("llm4ad:"): + return True + return False + + +_ALLOWED_TRAINER_KWARGS = { + "threads", + "num_epochs", + "num_steps", + "num_batches", + "num_candidates", + "num_proposals", + "num_iters", + "num_search_iterations", + "train_batch_size", + "merge_every", + "pareto_subset_size", + "ps_steps", + "ps_batches", + "ps_candidates", + "ps_proposals", + "ps_mem_update", + "gepa_iters", + "gepa_train_bs", + "gepa_merge_every", + "gepa_pareto_subset", + # LLM4AD pass-through knobs (merged into params_variants by config parser) + "optimizer_kwargs", + "eval_kwargs", +} + + +def _resolve_symbol(module_name: str, symbol: str) -> bool: + try: + module = 
def _validate_trainer_params(trainer, errors: list[str]) -> None:
    """Append strict-mode validation messages for *trainer* to *errors*.

    Flags unknown kwargs in every params variant, and any configured
    optimizer/guide/logger symbol that cannot be resolved in opto.
    """
    variants = trainer.params_variants or [{}]
    for params in variants:
        for key in params.keys():
            if key not in _ALLOWED_TRAINER_KWARGS:
                errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}")

    # (label, module to resolve against, configured symbol name)
    symbol_checks = (
        ("optimizer", "opto.optimizers", trainer.optimizer),
        ("guide", "opto.trainer.guide", trainer.guide),
        ("logger", "opto.trainer.loggers", trainer.logger),
    )
    for label, module_name, symbol in symbol_checks:
        if symbol and not _resolve_symbol(module_name, symbol):
            errors.append(f"{label} not found: {symbol}")
def cmd_run(config_path: str, root: str, runs_dir: str | None = None) -> int:
    """Load *config_path*, optionally override its runs_dir, and execute it.

    Always returns exit code 0; the run's outcome is recorded by the runner
    itself rather than reflected in the exit status.
    """
    cfg = load_config(config_path)
    if runs_dir:
        cfg.runs_dir = runs_dir
    BenchRunner(cfg, tasks_root=root).run()
    return 0
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv* and dispatch to the subcommand handler.

    Returns the handler's exit code, or 1 for an unrecognized subcommand
    (unreachable in practice since the subparser is required).
    """
    args = build_parser().parse_args(argv)

    dispatch = {
        "list-tasks": lambda: cmd_list_tasks(args.root, args.bench),
        "list-trainers": cmd_list_trainers,
        "validate": lambda: cmd_validate(args.config, args.root, args.bench, args.strict),
        "run": lambda: cmd_run(args.config, args.root, args.runs_dir),
        "ui": lambda: cmd_ui(args.runs_dir),
    }
    handler = dispatch.get(args.cmd)
    return handler() if handler is not None else 1
Path) -> Dict[str, Any]: + text = _load_text(path) + # Prefer YAML if available + try: + import yaml # type: ignore + data = yaml.safe_load(text) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError("Config must be a mapping at top-level") + return data + except Exception: + # Fallback to JSON for environments without PyYAML + try: + data = json.loads(text) + if not isinstance(data, dict): + raise ValueError("Config must be a mapping at top-level") + return data + except json.JSONDecodeError as exc: + raise ValueError( + f"Failed to parse config {path}. Install PyYAML or use JSON syntax. Error: {exc}" + ) + + +def _as_dict(value: Optional[Dict[str, Any]]) -> Dict[str, Any]: + return dict(value or {}) + + +def _normalize_key(key: str) -> str: + return key.replace("-", "_") + + +def _extract_llm4ad_knobs(data: Dict[str, Any]) -> Dict[str, Any]: + knobs: Dict[str, Any] = {} + for raw_key, value in data.items(): + key = _normalize_key(raw_key) + if key in _LLM4AD_KNOBS: + knobs[key] = value + return knobs + + +@dataclass +class TaskConfig: + id: str + eval_kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class TrainerConfig: + id: str + params_variants: List[Dict[str, Any]] = field(default_factory=list) + optimizer: Optional[str] = None + optimizer_kwargs: Dict[str, Any] = field(default_factory=dict) + guide: Optional[str] = None + guide_kwargs: Dict[str, Any] = field(default_factory=dict) + logger: Optional[str] = None + logger_kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class RunConfig: + run_id: Optional[str] = None + runs_dir: str = "runs" + mode: str = "stub" + seeds: List[int] = field(default_factory=lambda: [123]) + max_workers: int = 1 + fail_fast: bool = False + tasks: List[TaskConfig] = field(default_factory=list) + trainers: List[TrainerConfig] = field(default_factory=list) + eval_kwargs: Dict[str, Any] = field(default_factory=dict) + trainer_kwargs: Dict[str, Any] = 
field(default_factory=dict) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "RunConfig": + runs_dir = data.get("runs_dir", data.get("runs_root", "runs")) + mode = data.get("mode", "stub") + seeds = data.get("seeds") + if seeds is None: + seed = int(data.get("seed", 123)) + seeds = [seed] + else: + seeds = [int(x) for x in (seeds or [])] or [123] + + max_workers = int(data.get("max_workers", data.get("threads", 1))) + fail_fast = bool(data.get("fail_fast", False)) + + default_eval = _as_dict(data.get("eval_kwargs")) + default_trainer_kwargs = _as_dict(data.get("trainer_kwargs")) + default_trainer_kwargs.update(_extract_llm4ad_knobs(data)) + + tasks: List[TaskConfig] = [] + for item in list(data.get("tasks", []) or []): + if isinstance(item, str): + tasks.append(TaskConfig(id=item, eval_kwargs=dict(default_eval))) + elif isinstance(item, dict): + task_id = item.get("id") or item.get("key") or item.get("task") + if not task_id: + raise ValueError(f"Task entry missing id: {item}") + eval_kwargs = dict(default_eval) + eval_kwargs.update(_as_dict(item.get("eval_kwargs"))) + tasks.append(TaskConfig(id=str(task_id), eval_kwargs=eval_kwargs)) + else: + raise ValueError(f"Unsupported task entry: {item}") + + trainers: List[TrainerConfig] = [] + for item in list(data.get("trainers", []) or []): + if isinstance(item, str): + params_variants = [dict(default_trainer_kwargs)] + trainers.append(TrainerConfig(id=item, params_variants=params_variants)) + continue + if not isinstance(item, dict): + raise ValueError(f"Unsupported trainer entry: {item}") + + trainer_id = item.get("id") or item.get("name") or item.get("trainer") or item.get("key") + if not trainer_id: + raise ValueError(f"Trainer entry missing id: {item}") + + params_variants = item.get("params_variants") + if params_variants is None: + params = item.get("params") or item.get("trainer_kwargs") or {} + params_variants = [params] + normalized_variants: List[Dict[str, Any]] = [] + for variant in 
list(params_variants or [{}]): + merged = dict(default_trainer_kwargs) + merged.update(_extract_llm4ad_knobs(item)) + merged.update(dict(variant or {})) + normalized_variants.append(merged) + + trainers.append( + TrainerConfig( + id=str(trainer_id), + params_variants=normalized_variants, + optimizer=item.get("optimizer"), + optimizer_kwargs=_as_dict(item.get("optimizer_kwargs")), + guide=item.get("guide"), + guide_kwargs=_as_dict(item.get("guide_kwargs")), + logger=item.get("logger"), + logger_kwargs=_as_dict(item.get("logger_kwargs")), + ) + ) + + if not trainers: + trainers = [TrainerConfig(id="PrioritySearch", params_variants=[dict(default_trainer_kwargs)])] + + return cls( + run_id=data.get("run_id"), + runs_dir=runs_dir, + mode=mode, + seeds=seeds, + max_workers=max_workers, + fail_fast=fail_fast, + tasks=tasks, + trainers=trainers, + eval_kwargs=default_eval, + trainer_kwargs=default_trainer_kwargs, + ) + + def ensure_run_id(self) -> str: + if not self.run_id: + self.run_id = str(uuid.uuid4()) + return self.run_id + + def snapshot(self) -> Dict[str, Any]: + return { + "run_id": self.run_id, + "runs_dir": self.runs_dir, + "mode": self.mode, + "seeds": list(self.seeds), + "max_workers": self.max_workers, + "fail_fast": self.fail_fast, + "tasks": [ + {"id": task.id, "eval_kwargs": dict(task.eval_kwargs)} + for task in self.tasks + ], + "trainers": [ + { + "id": trainer.id, + "params_variants": [dict(p) for p in trainer.params_variants], + "optimizer": trainer.optimizer, + "optimizer_kwargs": dict(trainer.optimizer_kwargs), + "guide": trainer.guide, + "guide_kwargs": dict(trainer.guide_kwargs), + "logger": trainer.logger, + "logger_kwargs": dict(trainer.logger_kwargs), + } + for trainer in self.trainers + ], + "eval_kwargs": dict(self.eval_kwargs), + "trainer_kwargs": dict(self.trainer_kwargs), + } + + +def load_config(path: str | Path) -> RunConfig: + config_path = Path(path) + data = _load_yaml_or_json(config_path) + return RunConfig.from_dict(data) + + 
__all__ = ["RunConfig", "TaskConfig", "TrainerConfig", "load_config"]

# --- trace_bench/examples/__init__.py ------------------------------------------

"""Example tasks for Trace-Bench."""

# --- trace_bench/examples/greeting_stub.py -------------------------------------

from opto import trace
from opto.trainer.guide import Guide


class ExactMatchGuide(Guide):
    """Guide that rewards an exact string match against the reference."""

    def get_feedback(self, query: str, response: str, reference: str, **kwargs):
        if response == reference:
            return 1.0, "Correct"
        return 0.0, f"Expected: {reference}"


@trace.model
class GreetingAgent:
    """Composes a trainable greeting with the name taken from the query."""

    def __init__(self):
        self.greeting = trace.node("Hello", trainable=True)

    def __call__(self, user_query: str):
        # The addressee is assumed to be the final token of the query.
        tokens = user_query.split()
        person = tokens[-1].strip("!.?")
        return self.compose(self.greeting, person)

    @trace.bundle(trainable=True)
    def compose(self, greeting, name: str):
        # `greeting` may arrive as a trace node; unwrap to its raw value.
        raw = getattr(greeting, "data", greeting)
        return f"{raw}, {name}!"


def build_trace_problem(**override_eval_kwargs):
    """Build the canonical task bundle consumed by the bench runner."""
    return {
        "param": GreetingAgent(),
        "guide": ExactMatchGuide(),
        "train_dataset": {
            "inputs": ["Hello I am Sam"],
            "infos": ["Hello, Sam!"],
        },
        "optimizer_kwargs": {
            "objective": "Generate a correct greeting using the name from the query.",
            "memory_size": 5,
        },
        "metadata": {"benchmark": "example", "entry": "GreetingAgent"},
    }


__all__ = ["build_trace_problem", "GreetingAgent"]

# --- trace_bench/examples/internal_code_param.py --------------------------------

from opto import trace
from opto.trainer.guide import Guide


class CodeExactGuide(Guide):
    """Binary guide: the emitted code must match the reference verbatim."""

    def get_feedback(self, _query, response, reference, **_kwargs):
        if response == reference:
            return 1.0, "Correct"
        return 0.0, "Mismatch"


@trace.model
class CodeParamAgent:
    """Holds a single trainable code-string parameter and echoes it."""

    def __init__(self):
        self.code = trace.node("def f(x): return x", trainable=True)

    def __call__(self, _input):
        return self.emit(self.code)

    @trace.bundle(trainable=True)
    def emit(self, code):
        return code


def build_trace_problem(**_override_eval_kwargs):
    """Build the canonical task bundle for the internal code-param task."""
    return {
        "param": CodeParamAgent(),
        "guide": CodeExactGuide(),
        "train_dataset": {"inputs": [None], "infos": ["def f(x): return x"]},
        "optimizer_kwargs": {"objective": "Match the target code exactly.", "memory_size": 5},
        "metadata": {"benchmark": "internal", "entry": "CodeParamAgent"},
    }


__all__ = ["build_trace_problem", "CodeParamAgent"]
# --- trace_bench/examples/internal_multi_param.py -------------------------------

from opto import trace
from opto.trainer.guide import Guide


class SumGuide(Guide):
    """Negative absolute error between the numeric response and reference."""

    def get_feedback(self, _query, response, reference, **_kwargs):
        try:
            score = -abs(float(response) - float(reference))
        except Exception:
            # Non-numeric response: fixed penalty.
            score = -1.0
        return score, f"target={reference}"


@trace.model
class MultiParamAgent:
    """Two trainable scalars whose sum should hit the target."""

    def __init__(self):
        self.a = trace.node(1.0, trainable=True)
        self.b = trace.node(1.0, trainable=True)

    def __call__(self, _input):
        return self.combine(self.a, self.b)

    @trace.bundle(trainable=True)
    def combine(self, a, b):
        # Unwrap trace nodes to their raw values before adding.
        return float(getattr(a, "data", a)) + float(getattr(b, "data", b))


def build_trace_problem(**_override_eval_kwargs):
    """Build the canonical task bundle for the internal multi-param task."""
    return {
        "param": MultiParamAgent(),
        "guide": SumGuide(),
        "train_dataset": {"inputs": [None], "infos": [3.0]},
        "optimizer_kwargs": {"objective": "Make a+b match the target value.", "memory_size": 5},
        "metadata": {"benchmark": "internal", "entry": "MultiParamAgent"},
    }


__all__ = ["build_trace_problem", "MultiParamAgent"]

# --- trace_bench/examples/internal_non_trainable.py -----------------------------

from opto import trace
from opto.trainer.guide import Guide


class NoTrainGuide(Guide):
    """Binary exact-match guide for the non-trainable negative test."""

    def get_feedback(self, _query, response, reference, **_kwargs):
        if response == reference:
            return 1.0, "Correct"
        return 0.0, "Mismatch"


@trace.model
class NonTrainableAgent:
    """Agent with no trainable parameters — training should be rejected."""

    def __init__(self):
        self.value = trace.node("fixed", trainable=False)

    def __call__(self, _input):
        return self.emit(self.value)

    @trace.bundle(trainable=False)
    def emit(self, value):
        return value


def build_trace_problem(**_override_eval_kwargs):
    """Build the canonical bundle for the non-trainable negative test."""
    return {
        "param": NonTrainableAgent(),
        "guide": NoTrainGuide(),
        "train_dataset": {"inputs": [None], "infos": ["fixed"]},
        "optimizer_kwargs": {"objective": "This should fail due to no trainables.", "memory_size": 1},
        "metadata": {"benchmark": "internal", "entry": "NonTrainableAgent"},
    }


__all__ = ["build_trace_problem", "NonTrainableAgent"]

# --- trace_bench/examples/internal_numeric_param.py -----------------------------

from opto import trace
from opto.trainer.guide import Guide


class NumericGuide(Guide):
    """Negative absolute error between the numeric response and reference."""

    def get_feedback(self, _query, response, reference, **_kwargs):
        try:
            score = -abs(float(response) - float(reference))
        except Exception:
            score = -1.0
        return score, f"target={reference}"


@trace.model
class NumericParamAgent:
    """Single trainable scalar that should converge to the target value."""

    def __init__(self):
        self.value = trace.node(0.0, trainable=True)

    def __call__(self, _input):
        return self.emit(self.value)

    @trace.bundle(trainable=True)
    def emit(self, value):
        return value


def build_trace_problem(**_override_eval_kwargs):
    """Build the canonical task bundle for the internal numeric-param task."""
    return {
        "param": NumericParamAgent(),
        "guide": NumericGuide(),
        "train_dataset": {"inputs": [None], "infos": [3.0]},
        "optimizer_kwargs": {"objective": "Match the numeric target value.", "memory_size": 5},
        "metadata": {"benchmark": "internal", "entry": "NumericParamAgent"},
    }


__all__ = ["build_trace_problem", "NumericParamAgent"]
b/trace_bench/examples/train_single_node_stub.py new file mode 100644 index 0000000..e7f141f --- /dev/null +++ b/trace_bench/examples/train_single_node_stub.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class RegressionGuide(Guide): + def get_feedback(self, query, response, reference, **kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class SingleNodeAgent: + def __init__(self): + self.guess = trace.node(0.0, trainable=True) + + def __call__(self, _input): + return self.output(self.guess) + + @trace.bundle(trainable=True) + def output(self, guess): + return guess + + +def build_trace_problem(**override_eval_kwargs): + agent = SingleNodeAgent() + guide = RegressionGuide() + train_dataset = dict( + inputs=[None], + infos=[3.0], + ) + optimizer_kwargs = dict( + objective="Match the target scalar value.", + memory_size=5, + ) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="example", entry="SingleNodeAgent"), + ) + + +__all__ = ["build_trace_problem", "SingleNodeAgent"] diff --git a/trace_bench/matrix.py b/trace_bench/matrix.py new file mode 100644 index 0000000..158f0df --- /dev/null +++ b/trace_bench/matrix.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional +import hashlib +import json +import subprocess + +from trace_bench.config import RunConfig, TaskConfig, TrainerConfig + + +def _git_sha() -> str: + try: + return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip() + except Exception: + return "unknown" + + +def _stable_hash(payload: Dict[str, Any], length: int = 8) -> str: + data = json.dumps(payload, sort_keys=True, 
default=str).encode("utf-8") + return hashlib.sha256(data).hexdigest()[:length] + + +def compute_run_id(config_snapshot: Dict[str, Any], git_sha: Optional[str] = None) -> str: + timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") + payload = {"config": config_snapshot, "git": git_sha or _git_sha()} + return f"{timestamp}-{_stable_hash(payload, 8)}" + + +def compute_job_id(task_id: str, trainer_id: str, resolved_kwargs: Dict[str, Any], seed: int) -> str: + payload = { + "task_id": task_id, + "trainer_id": trainer_id, + "resolved_kwargs": resolved_kwargs, + "seed": seed, + } + return _stable_hash(payload, 12) + + +def task_suite(task_id: str) -> str: + if ":" in task_id: + return task_id.split(":", 1)[0] + return "llm4ad" + + +def resolve_job_kwargs(task: TaskConfig, trainer: TrainerConfig, params: Dict[str, Any]) -> Dict[str, Any]: + return { + "trainer_kwargs": dict(params), + "optimizer": trainer.optimizer, + "optimizer_kwargs": dict(trainer.optimizer_kwargs or {}), + "guide": trainer.guide, + "guide_kwargs": dict(trainer.guide_kwargs or {}), + "logger": trainer.logger, + "logger_kwargs": dict(trainer.logger_kwargs or {}), + "eval_kwargs": dict(task.eval_kwargs or {}), + } + + +@dataclass +class JobSpec: + job_id: str + task: TaskConfig + trainer: TrainerConfig + seed: int + params: Dict[str, Any] + resolved_kwargs: Dict[str, Any] + + @property + def task_id(self) -> str: + return self.task.id + + @property + def trainer_id(self) -> str: + return self.trainer.id + + @property + def suite(self) -> str: + return task_suite(self.task_id) + + +def expand_matrix(config: RunConfig) -> List[JobSpec]: + jobs: List[JobSpec] = [] + for task in config.tasks: + for trainer in config.trainers: + variants = trainer.params_variants or [{}] + for params in variants: + for seed in config.seeds: + resolved = resolve_job_kwargs(task, trainer, params) + job_id = compute_job_id(task.id, trainer.id, resolved, seed) + jobs.append( + JobSpec( + job_id=job_id, + task=task, + 
trainer=trainer, + seed=seed, + params=params, + resolved_kwargs=resolved, + ) + ) + return jobs diff --git a/trace_bench/registry.py b/trace_bench/registry.py new file mode 100644 index 0000000..66a10a7 --- /dev/null +++ b/trace_bench/registry.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Set +import importlib +import importlib.util +import json +import sys + + +@dataclass +class TaskSpec: + id: str + suite: str + module: str + + +@dataclass +class TrainerSpec: + id: str + source: str + available: bool + + +_INTERNAL_TASKS = { + "internal:code_param": "internal_code_param", + "internal:numeric_param": "internal_numeric_param", + "internal:multi_param": "internal_multi_param", + "internal:non_trainable": "internal_non_trainable", +} + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[1] + + +def _ensure_sys_path(path: Path) -> None: + if path.exists(): + path_str = str(path) + if path_str not in sys.path: + sys.path.insert(0, path_str) + + +def ensure_opto_importable() -> None: + try: + import opto # noqa: F401 + return + except Exception: + pass + repo_root = _repo_root() + _ensure_sys_path(repo_root.parent / "OpenTrace") + + +def ensure_llm4ad_importable(tasks_root: Path) -> None: + _ensure_sys_path(_repo_root()) + _ensure_sys_path(tasks_root.parent) + # Provide llm4ad_loader alias for task imports + try: + module = importlib.import_module("LLM4AD.llm4ad_loader") + sys.modules.setdefault("llm4ad_loader", module) + except Exception: + pass + + +def _load_index(tasks_root: Path) -> List[Dict[str, Any]]: + index_path = tasks_root / "index.json" + if not index_path.exists(): + return [] + return json.loads(index_path.read_text(encoding="utf-8")) + + +def discover_llm4ad(tasks_root: Path) -> List[TaskSpec]: + specs: List[TaskSpec] = [] + index = _load_index(tasks_root) + if index: + for entry in index: + key = 
entry.get("key") + module = entry.get("module") or entry.get("wrapper") + if key and module: + specs.append(TaskSpec(id=f"llm4ad:{key}", suite="llm4ad", module=module)) + return specs + # fallback: directories + for path in tasks_root.iterdir(): + if path.is_dir(): + specs.append(TaskSpec(id=f"llm4ad:{path.name}", suite="llm4ad", module=path.name)) + return specs + + +def discover_trace_examples() -> List[TaskSpec]: + return [ + TaskSpec(id="trace_examples:greeting_stub", suite="trace_examples", module="greeting_stub"), + TaskSpec(id="trace_examples:train_single_node_stub", suite="trace_examples", module="train_single_node_stub"), + ] + + +def discover_internal() -> List[TaskSpec]: + return [ + TaskSpec(id=task_id, suite="internal", module=module) + for task_id, module in _INTERNAL_TASKS.items() + ] + +def discover_veribench() -> List[TaskSpec]: + raise NotImplementedError("VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.") + + +def discover_trainers() -> List[TrainerSpec]: + ensure_opto_importable() + candidates = [ + ("PrioritySearch", "opto.features.priority_search", "PrioritySearch"), + ("GEPA-Base", "opto.features.gepa.gepa_algorithms", "GEPAAlgorithmBase"), + ("GEPA-UCB", "opto.features.gepa.gepa_algorithms", "GEPAUCBSearch"), + ("GEPA-Beam", "opto.features.gepa.gepa_algorithms", "GEPABeamPareto"), + ] + specs: List[TrainerSpec] = [] + for trainer_id, module, symbol in candidates: + available = True + try: + mod = importlib.import_module(module) + getattr(mod, symbol) + except Exception: + available = False + specs.append(TrainerSpec(id=trainer_id, source=module, available=available)) + return specs + + +def _parse_bench(bench: Optional[str]) -> Set[str]: + if not bench: + return {"llm4ad", "trace_examples", "internal"} + normalized = bench.replace("+", ",") + parts = [p.strip() for p in normalized.split(",") if p.strip()] + if not parts: + return {"llm4ad", "trace_examples", "internal"} + allowed = {"llm4ad", "trace_examples", 
"internal", "veribench"} + unknown = [p for p in parts if p not in allowed] + if unknown: + raise ValueError(f"Unknown bench selector(s): {unknown}. Allowed: {sorted(allowed)}") + return set(parts) + + +def discover_tasks(tasks_root: str | Path, bench: Optional[str] = None) -> List[TaskSpec]: + root = Path(tasks_root) + selected = _parse_bench(bench) + specs: List[TaskSpec] = [] + if "llm4ad" in selected: + specs.extend(discover_llm4ad(root)) + if "trace_examples" in selected: + specs.extend(discover_trace_examples()) + if "internal" in selected: + specs.extend(discover_internal()) + if "veribench" in selected: + specs.extend(discover_veribench()) + return specs + + +def _normalize_task_id(task_id: str) -> str: + if task_id.startswith("example:"): + return task_id.replace("example:", "trace_examples:", 1) + if ":" in task_id: + return task_id + return f"llm4ad:{task_id}" + + +def load_task_module(task_id: str, tasks_root: str | Path): + ensure_opto_importable() + root = Path(tasks_root) + task_id = _normalize_task_id(task_id) + if task_id.startswith("trace_examples:"): + module_name = task_id.split(":", 1)[1] + return importlib.import_module(f"trace_bench.examples.{module_name}") + if task_id.startswith("internal:"): + module_name = _INTERNAL_TASKS.get(task_id, task_id.split(":", 1)[1]) + return importlib.import_module(f"trace_bench.examples.{module_name}") + if task_id.startswith("veribench:"): + raise NotImplementedError("VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.") + + ensure_llm4ad_importable(root) + mapping = {spec.id.split(":", 1)[1]: spec.module for spec in discover_llm4ad(root)} + task_key = task_id.split(":", 1)[1] + module_dir = mapping.get(task_key, task_key) + module_path = root / module_dir / "__init__.py" + if not module_path.exists(): + raise FileNotFoundError(f"Task module not found: {module_path}") + + module_name = f"trace_bench_task_{module_dir}_{abs(hash(str(module_path)))}" + spec = 
importlib.util.spec_from_file_location(module_name, str(module_path)) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load spec for {module_path}") + mod = importlib.util.module_from_spec(spec) + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + return mod + + +def load_task_bundle(task_id: str, tasks_root: str | Path, eval_kwargs: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + task_id = _normalize_task_id(task_id) + if task_id.startswith("veribench:"): + raise NotImplementedError("VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.") + mod = load_task_module(task_id, tasks_root) + if not hasattr(mod, "build_trace_problem"): + raise AttributeError(f"Task module {task_id} missing build_trace_problem") + bundle = mod.build_trace_problem(**(eval_kwargs or {})) + required = {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"} + missing = required - set(bundle.keys()) + if missing: + raise KeyError(f"Task bundle missing keys: {sorted(missing)}") + return bundle + + +__all__ = [ + "TaskSpec", + "TrainerSpec", + "discover_tasks", + "discover_trainers", + "discover_veribench", + "load_task_bundle", + "load_task_module", +] diff --git a/trace_bench/results.py b/trace_bench/results.py new file mode 100644 index 0000000..3fcb4a9 --- /dev/null +++ b/trace_bench/results.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Any, Dict, List +import json + + +RESULT_COLUMNS = [ + "run_id", + "job_id", + "task_id", + "suite", + "trainer_id", + "seed", + "status", + "score_initial", + "score_final", + "score_best", + "time_seconds", + "resolved_trainer_kwargs", + "resolved_optimizer_kwargs", + "eval_kwargs", + "feedback", + "tb_logdir", +] + + +def _json_cell(value: Any) -> str: + try: + return json.dumps(value, sort_keys=True) + except Exception: + return json.dumps(str(value)) + + +def build_results_row( + run_id: str, + job_id: str, + task_id: str, + suite: str, + 
trainer_id: str, + seed: int, + status: str, + score_initial: Any, + score_final: Any, + score_best: Any, + time_seconds: float, + resolved_trainer_kwargs: Dict[str, Any], + resolved_optimizer_kwargs: Dict[str, Any], + eval_kwargs: Dict[str, Any], + feedback: str | None, + tb_logdir: str, +) -> Dict[str, Any]: + return { + "run_id": run_id, + "job_id": job_id, + "task_id": task_id, + "suite": suite, + "trainer_id": trainer_id, + "seed": seed, + "status": status, + "score_initial": score_initial, + "score_final": score_final, + "score_best": score_best, + "time_seconds": round(time_seconds, 6), + "resolved_trainer_kwargs": _json_cell(resolved_trainer_kwargs), + "resolved_optimizer_kwargs": _json_cell(resolved_optimizer_kwargs), + "eval_kwargs": _json_cell(eval_kwargs), + "feedback": feedback or "", + "tb_logdir": tb_logdir, + } + + +def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: + counts: Dict[str, int] = {"ok": 0, "failed": 0, "skipped": 0} + for row in rows: + status = row.get("status") or "ok" + if status not in counts: + counts[status] = 0 + counts[status] += 1 + return {"counts": counts, "total_jobs": len(rows)} + + +__all__ = ["RESULT_COLUMNS", "build_results_row", "summarize_results"] diff --git a/trace_bench/runner.py b/trace_bench/runner.py new file mode 100644 index 0000000..6581c4e --- /dev/null +++ b/trace_bench/runner.py @@ -0,0 +1,334 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional +import random +import time + +from trace_bench.artifacts import ( + RunArtifacts, + append_event, + append_results_csv, + init_job_dir, + init_run_dir, + write_config_snapshot, + write_env_json, + write_git_json, + write_manifest, + write_job_meta, + write_job_results, + write_summary, +) +from trace_bench.config import RunConfig, TrainerConfig +from trace_bench.matrix import JobSpec, compute_run_id, expand_matrix +from 
trace_bench.registry import load_task_bundle +from trace_bench.results import RESULT_COLUMNS, build_results_row, summarize_results + + +try: + from opto.trace.nodes import ParameterNode +except Exception: # pragma: no cover - only when opto is not available + ParameterNode = object # type: ignore + + +@dataclass +class RunSummary: + run_id: str + results: List[Dict[str, Any]] + + +def _extract_response(model: Any, input_value: Any) -> Any: + if isinstance(model, ParameterNode): + return getattr(model, "data", model) + if callable(model): + output = model(input_value) + return getattr(output, "data", output) + return getattr(model, "data", model) + + +def _evaluate_bundle(bundle: Dict[str, Any]) -> Dict[str, Any]: + dataset = bundle["train_dataset"] + guide = bundle["guide"] + inputs = dataset.get("inputs") or [] + infos = dataset.get("infos") or [] + if not inputs or not infos: + return {"score": None, "feedback": "empty_dataset"} + task_input = inputs[0] + task_info = infos[0] + response = _extract_response(bundle["param"], task_input) + try: + score, feedback = guide(task_input, response, task_info) + except Exception as exc: + return {"score": None, "feedback": f"eval_error: {exc}"} + return {"score": score, "feedback": feedback} + + +def _resolve_algorithm(name: str): + if name == "PrioritySearch": + return "PrioritySearch" + if name == "GEPA-Base": + from opto.features.gepa.gepa_algorithms import GEPAAlgorithmBase + return GEPAAlgorithmBase + if name == "GEPA-UCB": + from opto.features.gepa.gepa_algorithms import GEPAUCBSearch + return GEPAUCBSearch + if name == "GEPA-Beam": + from opto.features.gepa.gepa_algorithms import GEPABeamPareto + return GEPABeamPareto + return name + + +def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: + if algo_name == "PrioritySearch": + return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) + if algo_name == "GEPA-Base": + return dict(num_iters=1, train_batch_size=2, merge_every=2, 
pareto_subset_size=2) + # GEPA-UCB and GEPA-Beam use num_search_iterations + return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + + +def _param_alias_map(algo_name: str) -> Dict[str, str]: + """Return config-alias → opto-kwarg mapping for the given algorithm.""" + base = { + "ps_steps": "num_steps", + "ps_batches": "num_batches", + "ps_candidates": "num_candidates", + "ps_proposals": "num_proposals", + "ps_mem_update": "memory_update_frequency", + "gepa_train_bs": "train_batch_size", + "gepa_merge_every": "merge_every", + "gepa_pareto_subset": "pareto_subset_size", + } + if algo_name == "GEPA-Base": + base["gepa_iters"] = "num_iters" + else: + base["gepa_iters"] = "num_search_iterations" + return base + + +# Keys that should NOT be passed to opto_trainer.train() +_FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs", "threads"} + + +def _resolve_train_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: + """Map config aliases to actual train() kwargs and filter non-train keys.""" + kwargs = _default_trainer_kwargs(algo_name) + alias_map = _param_alias_map(algo_name) + for key, value in params.items(): + if key in _FILTERED_KWARGS: + continue + mapped_key = alias_map.get(key, key) + kwargs[mapped_key] = value + return kwargs + + +def _train_bundle(bundle: Dict[str, Any], trainer_spec: TrainerConfig, params: Dict[str, Any], mode: str) -> Dict[str, Any]: + from opto import trainer as opto_trainer + + algo_name = trainer_spec.id + algo = _resolve_algorithm(algo_name) + kwargs = _resolve_train_kwargs(params, algo_name) + + optimizer = trainer_spec.optimizer + guide = trainer_spec.guide or bundle["guide"] + logger = trainer_spec.logger or "ConsoleLogger" + guide_kwargs = trainer_spec.guide_kwargs or {} + logger_kwargs = trainer_spec.logger_kwargs or {} + + optimizer_kwargs = bundle.get("optimizer_kwargs", {}) + override_opt_kwargs = trainer_spec.optimizer_kwargs or None + if override_opt_kwargs: + optimizer_kwargs 
def _train_bundle(bundle: Dict[str, Any], trainer_spec: "TrainerConfig", params: Dict[str, Any], mode: str) -> Dict[str, Any]:
    """Run one opto training session for *bundle*; never raises.

    Returns a dict with ``status`` ("ok"/"failed"), the resolved optimizer and
    trainer kwargs, and — on failure — the stringified ``error``.
    """
    from opto import trainer as opto_trainer

    algo_name = trainer_spec.id
    algo = _resolve_algorithm(algo_name)
    kwargs = _resolve_train_kwargs(params, algo_name)

    optimizer = trainer_spec.optimizer
    guide = trainer_spec.guide or bundle["guide"]
    logger = trainer_spec.logger or "ConsoleLogger"
    guide_kwargs = trainer_spec.guide_kwargs or {}
    logger_kwargs = trainer_spec.logger_kwargs or {}

    # Trainer-level optimizer_kwargs override the task bundle's defaults.
    optimizer_kwargs = bundle.get("optimizer_kwargs", {})
    override_opt_kwargs = trainer_spec.optimizer_kwargs or None
    if override_opt_kwargs:
        optimizer_kwargs = override_opt_kwargs
    if isinstance(optimizer_kwargs, dict):
        optimizer_kwargs = dict(optimizer_kwargs)
    elif isinstance(optimizer_kwargs, list):
        # Copy the list elements too: the stub-mode `setdefault` below used to
        # mutate the task bundle's (or trainer config's) own dicts in place.
        optimizer_kwargs = [dict(item) for item in optimizer_kwargs]

    if mode == "stub":
        # Offline mode: inject a canned-response DummyLLM so no API is hit.
        try:
            from opto.utils.llm import DummyLLM

            def _dummy_response(*_args, **_kwargs):
                return '{"suggestion": {}}'

            dummy = DummyLLM(_dummy_response)
            if isinstance(optimizer_kwargs, list):
                for item in optimizer_kwargs:
                    item.setdefault("llm", dummy)
            elif isinstance(optimizer_kwargs, dict):
                optimizer_kwargs.setdefault("llm", dummy)
        except Exception:
            # DummyLLM unavailable in this opto version; proceed without it.
            pass

    try:
        opto_trainer.train(
            model=bundle["param"],
            train_dataset=bundle["train_dataset"],
            algorithm=algo,
            guide=guide,
            optimizer=optimizer,
            logger=logger,
            optimizer_kwargs=optimizer_kwargs,
            guide_kwargs=guide_kwargs,
            logger_kwargs=logger_kwargs,
            **kwargs,
        )
    except Exception as exc:
        return {"status": "failed", "error": str(exc), "optimizer_kwargs": optimizer_kwargs, "trainer_kwargs": kwargs}
    return {"status": "ok", "optimizer_kwargs": optimizer_kwargs, "trainer_kwargs": kwargs}


def _has_trainables(model: Any) -> bool:
    """Best-effort check that *model* exposes at least one trainable parameter.

    Defaults to True when introspection is impossible, so uncertain models are
    not blocked from training.
    """
    if isinstance(model, ParameterNode):
        return bool(getattr(model, "trainable", True))
    if hasattr(model, "parameters"):
        try:
            params = model.parameters()
            return any(getattr(p, "trainable", False) for p in params)
        except Exception:
            return True
    return True


class BenchRunner:
    """Expands a RunConfig into jobs, runs them, and writes run artifacts."""

    def __init__(self, config: "RunConfig", tasks_root: "str | Path" = "LLM4AD/benchmark_tasks"):
        self.config = config
        self.tasks_root = Path(tasks_root)
        # Seed stdlib randomness from the first configured seed for
        # reproducibility of any library code that consults `random`.
        random.seed(self.config.seeds[0] if self.config.seeds else 123)
        self.artifacts: Optional[RunArtifacts] = None

    def run(self) -> "RunSummary":
        """Execute the full task × trainer × seed matrix and write artifacts.

        Writes the config snapshot, env/git metadata, and job manifest up
        front, then runs jobs sequentially (honouring fail_fast) and finishes
        with a summary.
        """
        from datetime import timezone  # local: file header imports only `datetime`

        snapshot = self.config.snapshot()
        run_id = self.config.run_id or compute_run_id({k: v for k, v in snapshot.items() if k != "run_id"})
        self.config.run_id = run_id
        snapshot = self.config.snapshot()  # re-snapshot so run_id is included

        self.artifacts = init_run_dir(self.config.runs_dir, run_id)
        write_config_snapshot(self.artifacts.config_snapshot, snapshot)
        write_env_json(self.artifacts.env_json)
        write_git_json(self.artifacts.git_json)

        jobs = expand_matrix(self.config)
        manifest = {
            "run_id": run_id,
            # timezone-aware now(); utcnow() is deprecated since Python 3.12
            "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "jobs": [
                {
                    "job_id": job.job_id,
                    "task_id": job.task_id,
                    "suite": job.suite,
                    "trainer_id": job.trainer_id,
                    "seed": job.seed,
                    "resolved_trainer_kwargs": job.resolved_kwargs.get("trainer_kwargs", {}),
                    "resolved_optimizer_kwargs": job.resolved_kwargs.get("optimizer_kwargs", {}),
                    "eval_kwargs": job.resolved_kwargs.get("eval_kwargs", {}),
                }
                for job in jobs
            ],
        }
        write_manifest(self.artifacts.manifest_json, manifest)

        results: List[Dict[str, Any]] = []
        for job in jobs:
            results.append(self._run_job(job))
            if self.config.fail_fast and results[-1].get("status") == "failed":
                break

        write_summary(self.artifacts.summary_json, summarize_results(results))
        return RunSummary(run_id=run_id, results=results)
self.config.mode) + status = train_result.get("status", "ok") + resolved_optimizer_kwargs = train_result.get("optimizer_kwargs") or {} + resolved_trainer_kwargs = train_result.get("trainer_kwargs") or resolved_trainer_kwargs + if status == "failed": + feedback = f"training_error: {train_result.get('error', 'unknown')}" + final = _evaluate_bundle(bundle) + score_final = final.get("score") + if status != "failed": + feedback = final.get("feedback") or feedback + + if isinstance(score_initial, (int, float)) and isinstance(score_final, (int, float)): + score_best = max(score_initial, score_final) + else: + score_best = score_final if score_final is not None else score_initial + + elapsed = time.time() - start_time + tb_rel = str(Path("jobs") / job.job_id / "tb") + row = build_results_row( + run_id=self.config.run_id or "", + job_id=job.job_id, + task_id=job.task_id, + suite=job.suite, + trainer_id=job.trainer_id, + seed=job.seed, + status=status, + score_initial=score_initial, + score_final=score_final, + score_best=score_best, + time_seconds=elapsed, + resolved_trainer_kwargs=resolved_trainer_kwargs, + resolved_optimizer_kwargs=resolved_optimizer_kwargs, + eval_kwargs=job.task.eval_kwargs, + feedback=feedback, + tb_logdir=tb_rel, + ) + job_meta = { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "status": status, + "params": job.params, + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "optimizer": job.trainer.optimizer, + "optimizer_kwargs": job.trainer.optimizer_kwargs, + "guide": job.trainer.guide, + "guide_kwargs": job.trainer.guide_kwargs, + "logger": job.trainer.logger, + "logger_kwargs": job.trainer.logger_kwargs, + "eval_kwargs": job.task.eval_kwargs, + "feedback": feedback or "", + "tb_logdir": tb_rel, + } + write_job_meta(job_artifacts.job_meta, job_meta) + append_results_csv(self.artifacts.results_csv, RESULT_COLUMNS, 
def _read_text(path: Path) -> str:
    """Best-effort read of a UTF-8 text file; returns "" on any error."""
    try:
        return path.read_text(encoding="utf-8")
    except Exception:
        return ""


def _read_csv(path: Path):
    """Read a CSV file into a list of dict rows; [] when the file is absent."""
    if not path.exists():
        return []
    with path.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        return list(reader)


def launch_ui(runs_dir: str) -> int:
    """Launch the Gradio run browser over *runs_dir*.

    Each sub-directory of *runs_dir* is treated as a run id; selecting one
    shows its config snapshot, results.csv table, and env.json.

    Returns a process exit code: 0 after the UI server stops, 1 when the
    optional gradio dependency is not installed.
    """
    try:
        import gradio as gr
    except Exception:
        print("Gradio is not installed. Install with: pip install gradio")
        return 1

    runs_root = Path(runs_dir)
    # Run ids are simply the sub-directory names of the runs root.
    runs = sorted([p.name for p in runs_root.iterdir() if p.is_dir()]) if runs_root.exists() else []

    def load_run(run_id: str):
        # Dropdown callback: load the selected run's artifacts from disk.
        run_path = runs_root / run_id
        config_text = _read_text(run_path / "meta" / "config.snapshot.yaml")
        results = _read_csv(run_path / "results.csv")
        env_text = _read_text(run_path / "meta" / "env.json")
        return config_text, results, env_text

    with gr.Blocks() as demo:
        gr.Markdown("# Trace-Bench UI (Stub)")
        gr.Markdown("Select a run to view config, results, and env info.")
        run_selector = gr.Dropdown(choices=runs, label="Run ID")
        config_box = gr.Code(label="config.snapshot.yaml", language="yaml")
        results_df = gr.Dataframe(label="results.csv")
        env_box = gr.Code(label="env.json", language="json")

        run_selector.change(load_run, inputs=run_selector, outputs=[config_box, results_df, env_box])

        # Surface whether MLflow is available; full integration lands in M3.
        try:
            import mlflow  # noqa: F401
            gr.Markdown("MLflow detected. Full integration is pending (M3).")
        except Exception:
            gr.Markdown("MLflow not installed. Install if you want UI-linked runs.")

    demo.launch()  # blocks until the server is shut down
    return 0


__all__ = ["launch_ui"]
Falling back to STUB mode.\")\n", " print(\" All outputs below are labeled STUB — not real LLM results.\")\n", "\n", "os.environ[\"TB_MODE\"] = MODE\n", @@ -1541,4 +1541,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 8374498095ec7a904b3954283de3f86ee400d7fe Mon Sep 17 00:00:00 2001 From: Asad Date: Wed, 11 Feb 2026 10:13:57 +0500 Subject: [PATCH 3/8] m1: align validation, veribench skip, and trainer discovery --- .gitignore | 1 + README.md | 2 +- notebooks/01_m1_minimal_api.ipynb | 3047 +++++++++++++++-------------- tests/m1/test_threads_mapping.py | 32 + tests/m1/test_veribench_cli.py | 22 +- trace_bench/artifacts.py | 2 +- trace_bench/cli.py | 93 +- trace_bench/config.py | 7 +- trace_bench/matrix.py | 3 +- trace_bench/registry.py | 101 +- trace_bench/resolve.py | 95 + trace_bench/results.py | 16 +- trace_bench/runner.py | 172 +- 13 files changed, 1937 insertions(+), 1656 deletions(-) create mode 100644 tests/m1/test_threads_mapping.py create mode 100644 trace_bench/resolve.py diff --git a/.gitignore b/.gitignore index 074e707..9fdd1f6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ external/* runs/ runs_test/ notebooks/01_smoke_runner_with_output.ipynb +notebooks/01_m1_minimal_api_with_output.ipynb diff --git a/README.md b/README.md index c49779f..bf586a2 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Without strict mode, the smoke test skips only when optional deps are missing. ## VeriBench Status (In Scope, Pending Input) VeriBench is in scope but requires the Trace team to provide the task entrypoint/task list. -CLI flags are ready (`--bench veribench`), and will raise a clear `NotImplementedError` until the entrypoint is provided. +CLI flags are ready (`--bench veribench`); when the entrypoint is unavailable, tasks are skipped with a structured reason rather than raising. 
## Problem Sets diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb index 585e54f..4d8670c 100644 --- a/notebooks/01_m1_minimal_api.ipynb +++ b/notebooks/01_m1_minimal_api.ipynb @@ -1,1544 +1,1545 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "euYNX4m-m0Ty" + }, + "source": [ + "# Trace-Bench M1 \u2014 Minimal API Validation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", + "\n", + "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", + "\n", + "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." + ], + "id": "euYNX4m-m0Ty" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5DVjcAAm0UH" + }, + "source": [ + "## Expected Outputs\n", + "\n", + "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", + "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", + "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", + "- Internal non-trainable job shows `status=failed` with reason.\n", + "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", + "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." 
+ ], + "id": "u5DVjcAAm0UH" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "8D3DGyVXm0UJ", + "outputId": "aadad0ba-037c-4ffc-8d5a-4c55fb9d0d3f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "euYNX4m-m0Ty" - }, - "source": [ - "# Trace-Bench M1 — Minimal API Validation\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/runner-foundation/notebooks/01_m1_minimal_api.ipynb)\n", - "\n", - "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", - "\n", - "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." - ], - "id": "euYNX4m-m0Ty" - }, + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "Runs dir: /content/drive/MyDrive/bench/2026-02-09/trace_bench\n", + "API key found \u2014 running in REAL mode (model: gpt-4o-mini)\n", + "\n", + "Mode: real\n" + ] + } + ], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive_root = Path(\"/content/drive/MyDrive\")\n", + " root = drive_root if drive_root.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs 
dir:\", RUNS_DIR)\n", + "\n", + "# --- Auto-detect API key (real mode by default) ---\n", + "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", + "if not API_KEY:\n", + " try:\n", + " from google.colab import userdata\n", + " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", + " except Exception:\n", + " pass\n", + "\n", + "if API_KEY:\n", + " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", + " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = \"openrouter/openai/gpt-4o-mini\"\n", + " MODE = \"real\"\n", + " print(f\"API key found \u2014 running in REAL mode (model: gpt-4o-mini)\")\n", + "else:\n", + " MODE = \"stub\"\n", + " print(\"WARNING: No OPENROUTER_API_KEY found. Falling back to STUB mode.\")\n", + " print(\" All outputs below are labeled STUB \u2014 not real LLM results.\")\n", + "\n", + "os.environ[\"TB_MODE\"] = MODE\n", + "print(f\"\\nMode: {MODE}\")" + ], + "id": "8D3DGyVXm0UJ" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "swOi3Bhtm0UQ", + "outputId": "e9806308-35f8-48c5-e6b2-e5f46530a497", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "u5DVjcAAm0UH" - }, - "source": [ - "## Expected Outputs\n", - "\n", - "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", - "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", - "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", - "- Internal non-trainable job shows `status=failed` with reason.\n", - "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", - "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." 
- ], - "id": "u5DVjcAAm0UH" - }, + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Trace-Bench'...\n", + "remote: Enumerating objects: 315, done.\u001b[K\n", + "remote: Counting objects: 100% (315/315), done.\u001b[K\n", + "remote: Compressing objects: 100% (222/222), done.\u001b[K\n", + "remote: Total 315 (delta 42), reused 274 (delta 36), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (315/315), 3.86 MiB | 8.12 MiB/s, done.\n", + "Resolving deltas: 100% (42/42), done.\n", + "Cloning into 'OpenTrace'...\n", + "remote: Enumerating objects: 228, done.\u001b[K\n", + "remote: Counting objects: 100% (228/228), done.\u001b[K\n", + "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", + "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (228/228), 4.73 MiB | 14.77 MiB/s, done.\n", + "Resolving deltas: 100% (17/17), done.\n", + "/content/Trace-Bench\n", + "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:2 https://cli.github.com/packages stable InRelease [3,917 B]\n", + "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:4 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", + "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n", + "Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", + "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", + "Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,893 kB]\n", + "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", + "Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:13 
http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", + "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,661 kB]\n", + "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", + "Get:17 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,716 kB]\n", + "Get:18 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", + "Get:19 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", + "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", + "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", + "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,035 kB]\n", + "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", + "Fetched 37.1 MB in 6s (6,435 kB/s)\n", + "Reading package lists... Done\n", + "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... 
Done\n", + "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 55 not upgraded.\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", + "Collecting pip\n", + " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 24.1.2\n", + " Uninstalling pip-24.1.2:\n", + " Successfully uninstalled pip-24.1.2\n", + "Successfully installed pip-26.0.1\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", + "Collecting litellm==1.75.0\n", + " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", + "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", + "Requirement already satisfied: 
importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.16.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", + "Requirement 
already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", + "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", + "Requirement already satisfied: 
aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", + 
"Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.3.7)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K 
\u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: litellm\n", + "Successfully installed litellm-1.75.0\n" + ] + } + ], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ], + "id": "swOi3Bhtm0UQ" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "a__iRJTHm0UR", + "outputId": "f48aba86-b779-4537-f5ce-8d5b2bdc4154", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "8D3DGyVXm0UJ", - "outputId": "aadad0ba-037c-4ffc-8d5a-4c55fb9d0d3f", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n", - "Runs dir: /content/drive/MyDrive/bench/2026-02-09/trace_bench\n", - "API key found — running in REAL mode (model: gpt-4o-mini)\n", - "\n", - "Mode: real\n" - ] - } - ], - "source": [ - "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", - "from datetime import date\n", - "from pathlib import Path\n", - "import os\n", - "\n", - "try:\n", - " from google.colab import drive\n", - " 
drive.mount(\"/content/drive\")\n", - "except Exception:\n", - " pass\n", - "\n", - "\n", - "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", - " drive_root = Path(\"/content/drive/MyDrive\")\n", - " root = drive_root if drive_root.is_dir() else Path(local)\n", - " out = root / project / date.today().isoformat() / sub\n", - " out.mkdir(parents=True, exist_ok=True)\n", - " return str(out)\n", - "\n", - "RUNS_DIR = bench_dir()\n", - "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", - "print(\"Runs dir:\", RUNS_DIR)\n", - "\n", - "# --- Auto-detect API key (real mode by default) ---\n", - "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", - "if not API_KEY:\n", - " try:\n", - " from google.colab import userdata\n", - " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", - " except Exception:\n", - " pass\n", - "\n", - "if API_KEY:\n", - " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", - " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", - " os.environ[\"TRACE_LITELLM_MODEL\"] = \"openrouter/openai/gpt-4o-mini\"\n", - " MODE = \"real\"\n", - " print(f\"API key found — running in REAL mode (model: gpt-4o-mini)\")\n", - "else:\n", - " MODE = \"stub\"\n", - " print(\"WARNING: No OPENROUTER_API_KEY found. 
Falling back to STUB mode.\")\n", - " print(\" All outputs below are labeled STUB — not real LLM results.\")\n", - "\n", - "os.environ[\"TB_MODE\"] = MODE\n", - "print(f\"\\nMode: {MODE}\")" - ], - "id": "8D3DGyVXm0UJ" + "output_type": "stream", + "name": "stdout", + "text": [ + "=== List trainers ===\n", + "PrioritySearch\tavailable\n", + "GEPA-Base\tavailable\n", + "GEPA-UCB\tavailable\n", + "GEPA-Beam\tavailable\n", + "\n", + "=== Validate config (strict) ===\n", + "[OK] internal:code_param\n", + "[OK] internal:numeric_param\n", + "[OK] internal:multi_param\n", + "[OK] internal:non_trainable\n", + "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", + "[OK] trace_examples:greeting_stub\n", + "[OK] llm4ad:circle_packing\n", + "[SKIP] veribench:smoke_placeholder: VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.\n", + "\n", + "[OK] matrix: 28 jobs expanded deterministically\n", + " job 6f3619dd9ae0: internal:code_param x PrioritySearch (seed=123)\n", + " job c486ba93400f: internal:code_param x GEPA-Base (seed=123)\n", + " job a84d2486d31a: internal:code_param x GEPA-UCB (seed=123)\n", + " job 8ecff95cfafa: internal:code_param x GEPA-Beam (seed=123)\n", + " job 778da61d2682: internal:numeric_param x PrioritySearch (seed=123)\n", + " job 4b3a7f322126: internal:numeric_param x GEPA-Base (seed=123)\n", + " job 4b9c7d66d866: internal:numeric_param x GEPA-UCB (seed=123)\n", + " job 54df742bb5e9: internal:numeric_param x GEPA-Beam (seed=123)\n", + " job 0bfef35f6ef3: internal:multi_param x PrioritySearch (seed=123)\n", + " job e06adbe6489b: internal:multi_param x GEPA-Base (seed=123)\n", + " job 8669d9b963d4: internal:multi_param x GEPA-UCB (seed=123)\n", + " job 90d23f88baf7: internal:multi_param x GEPA-Beam (seed=123)\n", + " job d6aa82e5d119: internal:non_trainable x PrioritySearch (seed=123)\n", + " job 4f655637a6dc: internal:non_trainable x GEPA-Base (seed=123)\n", + " job 85940a1b71e7: internal:non_trainable x GEPA-UCB 
(seed=123)\n", + " job dafcec9c13af: internal:non_trainable x GEPA-Beam (seed=123)\n", + " job e8e9938a4ef6: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", + " job 4715e211f8a9: trace_examples:greeting_stub x GEPA-Base (seed=123)\n", + " job 8c4ec9f3e355: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", + " job 2f84751a35ad: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", + " job da0e8ae694f1: llm4ad:circle_packing x PrioritySearch (seed=123)\n", + " job 0865599891de: llm4ad:circle_packing x GEPA-Base (seed=123)\n", + " job d25dcdb59892: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", + " job d985faad90f4: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", + " job 364d89b28934: veribench:smoke_placeholder x PrioritySearch (seed=123)\n", + " job 721282ed015b: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", + " job 5b657b995d7a: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", + " job 77b3e4cb5bf0: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", + "\n", + " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", + " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 'PrioritySearch']\n", + "[OK] manifest written: runs/20260209-153344-8f7a72b4/meta/manifest.json\n", + "\n", + "=== Generate M1 run config (mode=real) ===\n", + "Config mode: real\n", + "\n", + "=== Run M1 validation ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:3: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:3: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:3: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.375582371483138\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.375582371483138\n", + "[Step 1] Update/best_candidate_mean_score: 1.375582371483138\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: -1000000.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:3: import numpy as np\n", + "import random\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " random.seed(2025)\n", + " np.random.seed(2025)\n", + "\n", + " circles = []\n", + " radii = np.random.uniform(0.01, 0.1, size=n) # Random radii between 0.01 and 0.1\n", + "\n", + " for _ in range(n):\n", + " placed = False\n", + " while not placed:\n", + " radius = np.random.choice(radii)\n", + " x 
= np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " overlap = False\n", + " \n", + " # Check for overlap\n", + " for circle in circles:\n", + " if np.sqrt((circle[0] - x) ** 2 + (circle[1] - y) ** 2) < (circle[2] + radius):\n", + " overlap = True\n", + " break\n", + " \n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " placed = True\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" + ] }, { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "swOi3Bhtm0UQ", - "outputId": "e9806308-35f8-48c5-e6b2-e5f46530a497", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Trace-Bench'...\n", - "remote: Enumerating objects: 315, done.\u001b[K\n", - "remote: Counting objects: 100% (315/315), done.\u001b[K\n", - "remote: Compressing objects: 100% (222/222), done.\u001b[K\n", - "remote: Total 315 (delta 42), reused 274 (delta 36), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (315/315), 3.86 MiB | 8.12 MiB/s, done.\n", - "Resolving deltas: 100% (42/42), done.\n", - "Cloning into 'OpenTrace'...\n", - "remote: Enumerating objects: 228, done.\u001b[K\n", - "remote: Counting objects: 100% (228/228), done.\u001b[K\n", - "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", - "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (228/228), 4.73 MiB | 14.77 MiB/s, done.\n", - "Resolving deltas: 100% (17/17), done.\n", - "/content/Trace-Bench\n", - "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", - "Get:2 https://cli.github.com/packages stable InRelease [3,917 B]\n", - "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", - "Get:4 https://cli.github.com/packages 
stable/main amd64 Packages [356 B]\n", - "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n", - "Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", - "Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", - "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", - "Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,893 kB]\n", - "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", - "Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", - "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", - "Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", - "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", - "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,661 kB]\n", - "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", - "Get:17 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,716 kB]\n", - "Get:18 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", - "Get:19 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", - "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", - "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", - "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,035 kB]\n", - "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", - "Fetched 37.1 MB in 6s (6,435 kB/s)\n", - "Reading package lists... 
Done\n", - "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", - "Reading package lists... Done\n", - "Building dependency tree... Done\n", - "Reading state information... Done\n", - "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", - "0 upgraded, 0 newly installed, 0 to remove and 55 not upgraded.\n", - "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", - "Collecting pip\n", - " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", - "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: pip\n", - " Attempting uninstall: pip\n", - " Found existing installation: pip 24.1.2\n", - " Uninstalling pip-24.1.2:\n", - " Successfully uninstalled pip-24.1.2\n", - "Successfully installed pip-26.0.1\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", - "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", - "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", - "Collecting litellm==1.75.0\n", - " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", - "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", - "Requirement already satisfied: httpx>=0.23.0 
in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", - "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", - "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", - "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", - "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.16.0)\n", - "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", - "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", - "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", - "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", - "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", - "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in 
/usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", - "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", - "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", - "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", - "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", - "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", - "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", - "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in 
/usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", - "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", - "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", - "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", - "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", - "Requirement 
already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", - "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", - "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", - "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.3.7)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", - "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", - "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", - "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from 
huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", - "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: litellm\n", - "Successfully installed litellm-1.75.0\n" - ] - } - ], - "source": [ - "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", - "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", - "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", - "\n", - "%cd Trace-Bench\n", - "\n", - "# System + Python deps\n", - "!apt-get update -y && apt-get install -y graphviz\n", - "!python -m pip install -U pip\n", - "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" - ], - "id": "swOi3Bhtm0UQ" - }, + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml < np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 1.375582371483138\n", - "[Step 1] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 1.375582371483138\n", - "[Step 1] Update/best_candidate_mean_score: 1.375582371483138\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 1.0407921408122753\n", - "[Step 1] Update/exploration_candidates_mean_score: 1.0407921408122753\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: -1000000.0\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:3: import numpy as np\n", - "import random\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " random.seed(2025)\n", - " np.random.seed(2025)\n", - "\n", - " circles = []\n", - " radii = np.random.uniform(0.01, 0.1, size=n) # Random radii between 0.01 and 0.1\n", - "\n", - " for _ in range(n):\n", - " placed = False\n", - " while not placed:\n", - " radius = np.random.choice(radii)\n", - " x 
= np.random.uniform(radius, 1 - radius)\n", - " y = np.random.uniform(radius, 1 - radius)\n", - " overlap = False\n", - " \n", - " # Check for overlap\n", - " for circle in circles:\n", - " if np.sqrt((circle[0] - x) ** 2 + (circle[1] - y) ** 2) < (circle[2] + radius):\n", - " overlap = True\n", - " break\n", - " \n", - " if not overlap:\n", - " circles.append([x, y, radius])\n", - " placed = True\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml <\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260209-153346-0daa4bb96f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.010.507114{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260209-153346-0daa4bb9c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.01.279633{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260209-153346-0daa4bb9778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.04.215786{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260209-153346-0daa4bb94b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.03.031100{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260209-153346-0daa4bb90bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.03.620341{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df", - "summary": "{\n \"name\": \"df\",\n \"rows\": 12,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260209-153346-0daa4bb9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"364d89b28934\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 462909.5869786947,\n \"min\": -1000000.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": {\n 
\"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.86582048810776,\n \"min\": 3.5e-05,\n \"max\": 28.849823,\n \"num_unique_values\": 12,\n \"samples\": [\n 4.2e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the numeric target value.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"jobs/364d89b28934/tb\"\n ],\n \"semantic_type\": 
\"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 4 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + " run_id job_id task_id suite \\\n", + "0 20260209-153346-0daa4bb9 6f3619dd9ae0 internal:code_param internal \n", + "1 20260209-153346-0daa4bb9 c486ba93400f internal:code_param internal \n", + "2 20260209-153346-0daa4bb9 778da61d2682 internal:numeric_param internal \n", + "3 20260209-153346-0daa4bb9 4b3a7f322126 internal:numeric_param internal \n", + "4 20260209-153346-0daa4bb9 0bfef35f6ef3 internal:multi_param internal \n", + "\n", + " trainer_id seed status score_initial score_final score_best \\\n", + "0 PrioritySearch 123 ok 1.0 1.0 1.0 \n", + "1 GEPA-Base 123 ok 1.0 1.0 1.0 \n", + "2 PrioritySearch 123 ok -3.0 -0.0 -0.0 \n", + "3 GEPA-Base 123 ok -3.0 -0.0 -0.0 \n", + "4 PrioritySearch 123 ok -1.0 -0.0 -0.0 \n", + "\n", + " time_seconds resolved_trainer_kwargs \\\n", + "0 10.507114 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "1 1.279633 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", + "2 4.215786 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "3 3.031100 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", + "4 3.620341 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "\n", + " resolved_optimizer_kwargs eval_kwargs \\\n", + "0 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "1 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "2 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "3 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "4 {\"memory_size\": 5, \"objective\": \"Make a+b matc... 
{\"timeout_seconds\": 10} \n", + "\n", + " feedback tb_logdir \n", + "0 Correct jobs/6f3619dd9ae0/tb \n", + "1 Correct jobs/c486ba93400f/tb \n", + "2 target=3.0 jobs/778da61d2682/tb \n", + "3 target=3.0 jobs/4b3a7f322126/tb \n", + "4 target=3.0 jobs/0bfef35f6ef3/tb " ], - "source": [ - "# Inspect latest run artifacts\n", - "import pathlib, json, pandas as pd\n", - "\n", - "runs_root = pathlib.Path(RUNS_DIR)\n", - "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", - "\n", - "run_dir = None\n", - "for p in reversed(candidates):\n", - " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", - " run_dir = p\n", - " break\n", - "\n", - "if run_dir is None:\n", - " for p in reversed(candidates):\n", - " if (p / \"config.snapshot.yaml\").exists():\n", - " run_dir = p\n", - " break\n", - "\n", - "if run_dir is None:\n", - " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", - "\n", - "print(\"Run dir:\", run_dir)\n", - "\n", - "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", - "env_path = run_dir / \"meta\" / \"env.json\"\n", - "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", - "\n", - "if not config_path.exists():\n", - " config_path = run_dir / \"config.snapshot.yaml\"\n", - " env_path = run_dir / \"env.json\"\n", - "\n", - "config_text = config_path.read_text()\n", - "print(config_text[:400])\n", - "\n", - "if manifest_path.exists():\n", - " manifest = json.loads(manifest_path.read_text())\n", - " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", - "\n", - "df = pd.read_csv(run_dir / \"results.csv\")\n", - "df.head()\n" + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260209-153346-0daa4bb96f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.010.507114{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260209-153346-0daa4bb9c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.01.279633{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260209-153346-0daa4bb9778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.04.215786{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260209-153346-0daa4bb94b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.03.031100{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260209-153346-0daa4bb90bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.03.620341{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" ], - "id": "ckY1HmQam0UU" - }, + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 12,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260209-153346-0daa4bb9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"364d89b28934\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 462909.5869786947,\n \"min\": -1000000.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n 
\"properties\": {\n \"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.86582048810776,\n \"min\": 3.5e-05,\n \"max\": 28.849823,\n \"num_unique_values\": 12,\n \"samples\": [\n 4.2e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the numeric target value.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"jobs/364d89b28934/tb\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gpkb4-1Em0UW" + }, + "source": [ + "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", + "\n", + "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." 
+ ], + "id": "gpkb4-1Em0UW" + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dMn7PDVgm0UX", + "outputId": "c37fef05-49b8-4180-dbc9-4b32fd20d45c", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "gpkb4-1Em0UW" - }, - "source": [ - "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", - "\n", - "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." - ], - "id": "gpkb4-1Em0UW" + "output_type": "stream", + "name": "stdout", + "text": [ + "=== 2x2 Matrix Smoke (mode=real) ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.6499617928349034\n", + "[Step 1] \u001b[94mAlgo/Average train score: -749999.8375095518\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.6499617928349034\n", + "[Step 1] Update/best_candidate_mean_score: 0.6499617928349034\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: -499999.67501910357\n", + "[Step 1] Update/exploration_candidates_mean_score: -499999.67501910357\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: -499999.67501910357\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + "\n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + "\n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " np.random.seed(2025)\n", + " \n", + " circles = []\n", + " for _ in range(n):\n", + " radius = np.random.rand() * 0.05 # Variable radius, capped to keep circles small\n", + " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Ensures circles fit in unit square\n", + "\n", + " # Check 
for overlapping\n", + " while any(np.linalg.norm([x - circle[0], y - circle[1]]) < (radius + circle[2]) for circle in circles):\n", + " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Reposition if overlap detected\n", + "\n", + " circles.append([x, y, radius])\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.4689943904012859\u001b[0m\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "dMn7PDVgm0UX", - "outputId": "c37fef05-49b8-4180-dbc9-4b32fd20d45c", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "=== 2x2 Matrix Smoke (mode=real) ===\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with only long-term memory.\n", - "Epoch: 0. Iteration: 0\n", - "[Step 0] Test/test_score: -3.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -3.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", - " return value\u001b[0m\n", - "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.0\n", - "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 3\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.0\n", - "[Step 1] Update/best_candidate_mean_score: 0.0\n", - "[Step 1] Update/best_candidate_num_rollouts: 2\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", - "[Step 1] Sample/mean_score: 0.0\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", - " return value\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with only long-term memory.\n", - "Epoch: 0. 
Iteration: 0\n", - "[Step 0] Test/test_score: -1000000.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -1000000.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", - "import math\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.6499617928349034\n", - "[Step 1] \u001b[94mAlgo/Average train score: -749999.8375095518\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.6499617928349034\n", - "[Step 1] Update/best_candidate_mean_score: 0.6499617928349034\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: -499999.67501910357\n", - "[Step 1] Update/exploration_candidates_mean_score: -499999.67501910357\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: -499999.67501910357\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", - "import math\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - "\n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - "\n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - " np.random.seed(2025)\n", - " \n", - " circles = []\n", - " for _ in range(n):\n", - " radius = np.random.rand() * 0.05 # Variable radius, capped to keep circles small\n", - " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Ensures circles fit in unit square\n", - "\n", - " # Check 
for overlapping\n", - " while any(np.linalg.norm([x - circle[0], y - circle[1]]) < (radius + circle[2]) for circle in circles):\n", - " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Reposition if overlap detected\n", - "\n", - " circles.append([x, y, radius])\n", - " \n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 1.4689943904012859\u001b[0m\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml < /content/m1_matrix.yaml <\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.000000
1internal:numeric_paraminternalGEPA-Base123ok-0.000000
2llm4ad:circle_packingllm4adPrioritySearch123ok0.649962
3llm4ad:circle_packingllm4adGEPA-Base123ok1.468994
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.697113339555075,\n \"min\": -0.0,\n \"max\": 1.468994390401286,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 6 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + " task_id suite trainer_id seed status score_best\n", + "0 internal:numeric_param internal PrioritySearch 123 ok -0.000000\n", + "1 internal:numeric_param internal GEPA-Base 123 ok -0.000000\n", + "2 
llm4ad:circle_packing llm4ad PrioritySearch 123 ok 0.649962\n", + "3 llm4ad:circle_packing llm4ad GEPA-Base 123 ok 1.468994" ], - "source": [ - "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", - "import json, pathlib, pandas as pd\n", - "\n", - "runs_root = pathlib.Path(RUNS_DIR)\n", - "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", - "\n", - "matrix_dir = None\n", - "for p in reversed(candidates):\n", - " summary_path = p / \"summary.json\"\n", - " if not summary_path.exists():\n", - " continue\n", - " try:\n", - " summary = json.loads(summary_path.read_text())\n", - " except Exception:\n", - " continue\n", - " if summary.get(\"total_jobs\") == 4:\n", - " matrix_dir = p\n", - " break\n", - "\n", - "if matrix_dir is None:\n", - " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", - "\n", - "print(\"Matrix run dir:\", matrix_dir)\n", - "\n", - "df = pd.read_csv(matrix_dir / \"results.csv\")\n", - "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", - "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", - "\n", - "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", - "print(f\"summary.json: {summary}\")\n", - "assert summary.get(\"total_jobs\") == 4\n", - "\n", - "print(\"\\n--- Matrix results ---\")\n", - "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.000000
1internal:numeric_paraminternalGEPA-Base123ok-0.000000
2llm4ad:circle_packingllm4adPrioritySearch123ok0.649962
3llm4ad:circle_packingllm4adGEPA-Base123ok1.468994
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" ], - "id": "W18tGXfYm0UZ" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10" - }, - "colab": { - "provenance": [] + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.697113339555075,\n \"min\": -0.0,\n \"max\": 1.468994390401286,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 } + ], + "source": [ + "# Verify 2x2 matrix: exactly 
4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + ], + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" }, - "nbformat": 4, - "nbformat_minor": 5 + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tests/m1/test_threads_mapping.py b/tests/m1/test_threads_mapping.py new file mode 100644 index 0000000..7746ced --- /dev/null +++ b/tests/m1/test_threads_mapping.py @@ -0,0 +1,32 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def 
test_threads_maps_to_num_threads(tmp_path): + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [{"id": "internal:numeric_param"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"threads": 3}]}], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + job_dirs = [p for p in (run_dir / "jobs").iterdir() if p.is_dir()] + assert job_dirs, "expected at least one job directory" + meta = json.loads((job_dirs[0] / "job_meta.json").read_text(encoding="utf-8")) + assert meta["resolved_trainer_kwargs"]["num_threads"] == 3 + + with (run_dir / "results.csv").open("r", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + assert rows, "expected at least one results row" + resolved = json.loads(rows[0]["resolved_trainer_kwargs"]) + assert resolved["num_threads"] == 3 diff --git a/tests/m1/test_veribench_cli.py b/tests/m1/test_veribench_cli.py index d627e5f..086326b 100644 --- a/tests/m1/test_veribench_cli.py +++ b/tests/m1/test_veribench_cli.py @@ -1,17 +1,15 @@ -import pytest - from trace_bench.cli import cmd_list_tasks, cmd_validate -def test_veribench_list_tasks_explicit_failure(): - with pytest.raises(NotImplementedError) as exc: - cmd_list_tasks("LLM4AD/benchmark_tasks", bench="veribench") - assert "awaiting trace team entrypoint/task list" in str(exc.value).lower() +def test_veribench_list_tasks_does_not_fail(): + assert cmd_list_tasks("LLM4AD/benchmark_tasks", bench="veribench") == 0 -def test_veribench_validate_explicit_failure(tmp_path): - config_path = tmp_path / "empty.yaml" - config_path.write_text("tasks: []\n", encoding="utf-8") - with pytest.raises(NotImplementedError) as exc: - cmd_validate(str(config_path), "LLM4AD/benchmark_tasks", bench="veribench") - assert "awaiting trace team entrypoint/task list" in str(exc.value).lower() +def test_veribench_validate_does_not_fail(tmp_path, capsys): + config_path = tmp_path / "veribench.yaml" + 
config_path.write_text( + "tasks:\n - id: veribench:smoke_placeholder\n", encoding="utf-8" + ) + assert cmd_validate(str(config_path), "LLM4AD/benchmark_tasks", bench="veribench") == 0 + out = capsys.readouterr().out + assert "[SKIP]" in out diff --git a/trace_bench/artifacts.py b/trace_bench/artifacts.py index 40126dc..390d351 100644 --- a/trace_bench/artifacts.py +++ b/trace_bench/artifacts.py @@ -188,7 +188,7 @@ def append_results_csv(path: Path, fieldnames: List[str], row: Dict[str, Any]) - def append_event(path: Path, event: Dict[str, Any]) -> None: with path.open("a", encoding="utf-8") as f: - f.write(json.dumps(event, ensure_ascii=False) + "\n") + f.write(json.dumps(event, ensure_ascii=False, default=str) + "\n") def write_summary(path: Path, summary: Dict[str, Any]) -> None: diff --git a/trace_bench/cli.py b/trace_bench/cli.py index 694af6b..f864fe5 100644 --- a/trace_bench/cli.py +++ b/trace_bench/cli.py @@ -1,12 +1,15 @@ from __future__ import annotations import argparse +import json +from datetime import datetime from pathlib import Path import sys from trace_bench.config import load_config from trace_bench.matrix import compute_run_id, expand_matrix from trace_bench.registry import discover_tasks, discover_trainers, load_task_bundle +from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs from trace_bench.runner import BenchRunner, _has_trainables from trace_bench.artifacts import init_run_dir, write_manifest from trace_bench.ui import launch_ui @@ -19,9 +22,11 @@ def cmd_list_tasks(root: str, bench: str | None = None) -> int: return 0 -def cmd_list_trainers() -> int: +def cmd_list_trainers(include_all: bool = False) -> int: specs = discover_trainers() for spec in specs: + if not include_all and not spec.available: + continue status = "available" if spec.available else "unavailable" print(f"{spec.id}\t{status}") return 0 @@ -45,6 +50,7 @@ def _task_in_bench(task_key: str, bench: str | None) -> bool: _ALLOWED_TRAINER_KWARGS = { "threads", + 
"num_threads", "num_epochs", "num_steps", "num_batches", @@ -92,8 +98,16 @@ def _validate_trainer_params(trainer, errors: list[str]) -> None: errors.append(f"logger not found: {trainer.logger}") -def cmd_validate(config_path: str, root: str, bench: str | None = None, strict: bool = False) -> int: +def cmd_validate( + config_path: str, + root: str, + bench: str | None = None, + strict: bool = False, + runs_dir: str | None = None, +) -> int: cfg = load_config(config_path) + if runs_dir: + cfg.runs_dir = runs_dir tasks_root = Path(root) errors = 0 if bench: @@ -112,12 +126,33 @@ def cmd_validate(config_path: str, root: str, bench: str | None = None, strict: print(f"[FAIL] {msg}") errors += len(strict_errors) + bundle_cache: dict[str, dict | None] = {} + + def _bundle_cache_key(task) -> str: + eval_sig = json.dumps(task.eval_kwargs or {}, sort_keys=True) + return f"{task.id}|{eval_sig}" + + def _cache_bundle(task, bundle): + bundle_cache[_bundle_cache_key(task)] = bundle + + def _get_cached_bundle(task): + key = _bundle_cache_key(task) + if key in bundle_cache: + return bundle_cache[key] + try: + bundle = load_task_bundle(task.id, tasks_root, eval_kwargs=task.eval_kwargs) + _cache_bundle(task, bundle) + except Exception: + bundle_cache[key] = None + return bundle_cache.get(key) + for task in cfg.tasks: task_id = task.id if not _task_in_bench(task_id, bench): continue try: bundle = load_task_bundle(task_id, tasks_root, eval_kwargs=task.eval_kwargs) + _cache_bundle(task, bundle) print(f"[OK] {task_id}") if strict: if not _has_trainables(bundle["param"]): @@ -151,6 +186,7 @@ def cmd_validate(config_path: str, root: str, bench: str | None = None, strict: artifacts = init_run_dir(cfg.runs_dir, run_id) manifest = { "run_id": run_id, + "generated_at": datetime.utcnow().isoformat() + "Z", "jobs": [ { "job_id": job.job_id, @@ -158,9 +194,20 @@ def cmd_validate(config_path: str, root: str, bench: str | None = None, strict: "suite": job.suite, "trainer_id": job.trainer_id, 
"seed": job.seed, - "resolved_trainer_kwargs": job.resolved_kwargs.get("trainer_kwargs", {}), - "resolved_optimizer_kwargs": job.resolved_kwargs.get("optimizer_kwargs", {}), - "eval_kwargs": job.resolved_kwargs.get("eval_kwargs", {}), + "resolved_trainer_kwargs": resolve_trainer_kwargs(job.params, job.trainer_id), + "resolved_optimizer_kwargs": merge_kwargs( + (_get_cached_bundle(job.task) or {}).get("optimizer_kwargs", {}), + job.trainer.optimizer_kwargs or {}, + ), + "resolved_guide_kwargs": merge_kwargs( + (_get_cached_bundle(job.task) or {}).get("guide_kwargs"), + job.trainer.guide_kwargs or {}, + ), + "resolved_logger_kwargs": merge_kwargs( + (_get_cached_bundle(job.task) or {}).get("logger_kwargs"), + job.trainer.logger_kwargs or {}, + ), + "eval_kwargs": dict(job.task.eval_kwargs or {}), } for job in jobs ], @@ -170,10 +217,17 @@ def cmd_validate(config_path: str, root: str, bench: str | None = None, strict: return 1 if errors else 0 -def cmd_run(config_path: str, root: str, runs_dir: str | None = None) -> int: +def cmd_run( + config_path: str, + root: str, + runs_dir: str | None = None, + max_workers: int | None = None, +) -> int: cfg = load_config(config_path) if runs_dir: cfg.runs_dir = runs_dir + if max_workers is not None: + cfg.max_workers = max_workers runner = BenchRunner(cfg, tasks_root=root) runner.run() return 0 @@ -189,20 +243,35 @@ def build_parser() -> argparse.ArgumentParser: list_p = sub.add_parser("list-tasks", help="List discoverable tasks") list_p.add_argument("--root", default="LLM4AD/benchmark_tasks") - list_p.add_argument("--bench", default=None, help="Bench selection: llm4ad,trace_examples,internal,veribench") + list_p.add_argument( + "--bench", + "--dataset-name", + dest="bench", + default=None, + help="Bench selection: llm4ad,trace_examples,internal,veribench", + ) list_t = sub.add_parser("list-trainers", help="List discoverable trainers") + list_t.add_argument("--all", action="store_true", help="Include unavailable trainers") val_p 
= sub.add_parser("validate", help="Validate tasks in config") val_p.add_argument("--config", required=True) val_p.add_argument("--root", default="LLM4AD/benchmark_tasks") - val_p.add_argument("--bench", default=None, help="Bench selection: llm4ad,trace_examples,internal,veribench") + val_p.add_argument( + "--bench", + "--dataset-name", + dest="bench", + default=None, + help="Bench selection: llm4ad,trace_examples,internal,veribench", + ) val_p.add_argument("--strict", action="store_true") + val_p.add_argument("--runs-dir", "--output-dir", dest="runs_dir", default=None) run_p = sub.add_parser("run", help="Run a benchmark config") run_p.add_argument("--config", required=True) run_p.add_argument("--root", default="LLM4AD/benchmark_tasks") - run_p.add_argument("--runs-dir", default=None) + run_p.add_argument("--runs-dir", "--output-dir", dest="runs_dir", default=None) + run_p.add_argument("--max-workers", "--n-concurrent", dest="max_workers", type=int, default=None) ui_p = sub.add_parser("ui", help="Launch Gradio UI (stub)") ui_p.add_argument("--runs-dir", default="runs") @@ -217,11 +286,11 @@ def main(argv: list[str] | None = None) -> int: if args.cmd == "list-tasks": return cmd_list_tasks(args.root, args.bench) if args.cmd == "list-trainers": - return cmd_list_trainers() + return cmd_list_trainers(args.all) if args.cmd == "validate": - return cmd_validate(args.config, args.root, args.bench, args.strict) + return cmd_validate(args.config, args.root, args.bench, args.strict, args.runs_dir) if args.cmd == "run": - return cmd_run(args.config, args.root, args.runs_dir) + return cmd_run(args.config, args.root, args.runs_dir, args.max_workers) if args.cmd == "ui": return cmd_ui(args.runs_dir) return 1 diff --git a/trace_bench/config.py b/trace_bench/config.py index 301fec8..6d89237 100644 --- a/trace_bench/config.py +++ b/trace_bench/config.py @@ -9,6 +9,7 @@ _LLM4AD_KNOBS = { "threads", + "num_threads", "optimizer_kwargs", "eval_kwargs", "ps_steps", @@ -110,7 +111,11 @@ 
def from_dict(cls, data: Dict[str, Any]) -> "RunConfig": else: seeds = [int(x) for x in (seeds or [])] or [123] - max_workers = int(data.get("max_workers", data.get("threads", 1))) + if "max_workers" in data: + max_workers = data.get("max_workers") + else: + max_workers = data.get("n_concurrent", data.get("n-concurrent", 1)) + max_workers = int(max_workers) fail_fast = bool(data.get("fail_fast", False)) default_eval = _as_dict(data.get("eval_kwargs")) diff --git a/trace_bench/matrix.py b/trace_bench/matrix.py index 158f0df..ea0f232 100644 --- a/trace_bench/matrix.py +++ b/trace_bench/matrix.py @@ -8,6 +8,7 @@ import subprocess from trace_bench.config import RunConfig, TaskConfig, TrainerConfig +from trace_bench.resolve import resolve_trainer_kwargs def _git_sha() -> str: @@ -46,7 +47,7 @@ def task_suite(task_id: str) -> str: def resolve_job_kwargs(task: TaskConfig, trainer: TrainerConfig, params: Dict[str, Any]) -> Dict[str, Any]: return { - "trainer_kwargs": dict(params), + "trainer_kwargs": resolve_trainer_kwargs(params, trainer.id), "optimizer": trainer.optimizer, "optimizer_kwargs": dict(trainer.optimizer_kwargs or {}), "guide": trainer.guide, diff --git a/trace_bench/registry.py b/trace_bench/registry.py index 66a10a7..8096e17 100644 --- a/trace_bench/registry.py +++ b/trace_bench/registry.py @@ -3,9 +3,12 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set +import ast import importlib import importlib.util +import inspect import json +import pkgutil import sys @@ -30,6 +33,17 @@ class TrainerSpec: "internal:non_trainable": "internal_non_trainable", } +_TRAINER_ALIASES = { + "GEPAAlgorithmBase": "GEPA-Base", + "GEPAUCBSearch": "GEPA-UCB", + "GEPABeamPareto": "GEPA-Beam", +} + +_VERIBENCH_UNAVAILABLE = ( + "veribench_unavailable: entrypoint not available (install Veribench or provide task list)" +) +_VERIBENCH_PLACEHOLDER = "veribench:smoke_placeholder" + def _repo_root() -> Path: return 
Path(__file__).resolve().parents[1] @@ -100,27 +114,80 @@ def discover_internal() -> List[TaskSpec]: ] def discover_veribench() -> List[TaskSpec]: - raise NotImplementedError("VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.") + # Always return a placeholder task so CLI/validate can skip with a reason. + if importlib.util.find_spec("veribench") is None: + return [TaskSpec(id=_VERIBENCH_PLACEHOLDER, suite="veribench", module="veribench_unavailable")] + # Entry point not wired yet; keep placeholder until a task list is provided. + return [TaskSpec(id=_VERIBENCH_PLACEHOLDER, suite="veribench", module="veribench_unavailable")] + + +def _iter_module_names(package_name: str) -> Iterable[str]: + try: + package = importlib.import_module(package_name) + except Exception: + return [] + names: List[str] = [package.__name__] + if hasattr(package, "__path__"): + for module_info in pkgutil.walk_packages(package.__path__, package.__name__ + "."): + names.append(module_info.name) + return names + + +def _class_names_from_file(module_name: str) -> List[str]: + spec = importlib.util.find_spec(module_name) + if spec is None or not spec.origin or not spec.origin.endswith(".py"): + return [] + try: + source = Path(spec.origin).read_text(encoding="utf-8") + tree = ast.parse(source) + except Exception: + return [] + names: List[str] = [] + for node in tree.body: + if not isinstance(node, ast.ClassDef): + continue + base_names: List[str] = [] + for base in node.bases: + if isinstance(base, ast.Name): + base_names.append(base.id) + elif isinstance(base, ast.Attribute): + base_names.append(base.attr) + if any(name.endswith("Trainer") or name.endswith("Algorithm") for name in base_names): + if node.name in {"Trainer", "Algorithm"}: + continue + names.append(node.name) + return names def discover_trainers() -> List[TrainerSpec]: ensure_opto_importable() - candidates = [ - ("PrioritySearch", "opto.features.priority_search", "PrioritySearch"), - ("GEPA-Base", 
"opto.features.gepa.gepa_algorithms", "GEPAAlgorithmBase"), - ("GEPA-UCB", "opto.features.gepa.gepa_algorithms", "GEPAUCBSearch"), - ("GEPA-Beam", "opto.features.gepa.gepa_algorithms", "GEPABeamPareto"), - ] - specs: List[TrainerSpec] = [] - for trainer_id, module, symbol in candidates: - available = True + from opto.trainer.algorithms.algorithm import Trainer as TrainerBase + + specs: Dict[str, TrainerSpec] = {} + module_names: List[str] = [] + module_names.extend(_iter_module_names("opto.trainer.algorithms")) + module_names.extend(_iter_module_names("opto.features")) + + for module_name in sorted(set(module_names)): try: - mod = importlib.import_module(module) - getattr(mod, symbol) + module = importlib.import_module(module_name) except Exception: - available = False - specs.append(TrainerSpec(id=trainer_id, source=module, available=available)) - return specs + for class_name in _class_names_from_file(module_name): + trainer_id = _TRAINER_ALIASES.get(class_name, class_name) + if trainer_id not in specs: + specs[trainer_id] = TrainerSpec(id=trainer_id, source=module_name, available=False) + continue + + for _name, obj in vars(module).items(): + if not inspect.isclass(obj): + continue + if obj is TrainerBase: + continue + if not issubclass(obj, TrainerBase): + continue + trainer_id = _TRAINER_ALIASES.get(obj.__name__, obj.__name__) + specs[trainer_id] = TrainerSpec(id=trainer_id, source=obj.__module__, available=True) + return sorted(specs.values(), key=lambda spec: spec.id) def _parse_bench(bench: Optional[str]) -> Set[str]: @@ -171,7 +238,7 @@ def load_task_module(task_id: str, tasks_root: str | Path): module_name = _INTERNAL_TASKS.get(task_id, task_id.split(":", 1)[1]) return importlib.import_module(f"trace_bench.examples.{module_name}") if task_id.startswith("veribench:"): - raise NotImplementedError("VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.") + raise NotImplementedError(_VERIBENCH_UNAVAILABLE) ensure_llm4ad_importable(root) 
mapping = {spec.id.split(":", 1)[1]: spec.module for spec in discover_llm4ad(root)} @@ -194,7 +261,7 @@ def load_task_module(task_id: str, tasks_root: str | Path): def load_task_bundle(task_id: str, tasks_root: str | Path, eval_kwargs: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: task_id = _normalize_task_id(task_id) if task_id.startswith("veribench:"): - raise NotImplementedError("VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.") + raise NotImplementedError(_VERIBENCH_UNAVAILABLE) mod = load_task_module(task_id, tasks_root) if not hasattr(mod, "build_trace_problem"): raise AttributeError(f"Task module {task_id} missing build_trace_problem") diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py new file mode 100644 index 0000000..e285341 --- /dev/null +++ b/trace_bench/resolve.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import Any, Dict, List + + +_FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs"} + + +def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: + if algo_name == "PrioritySearch": + return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) + if algo_name == "GEPA-Base": + return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + # GEPA-UCB and GEPA-Beam use num_search_iterations + return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + + +def _param_alias_map(algo_name: str) -> Dict[str, str]: + base = { + "threads": "num_threads", + "ps_steps": "num_steps", + "ps_batches": "num_batches", + "ps_candidates": "num_candidates", + "ps_proposals": "num_proposals", + "ps_mem_update": "memory_update_frequency", + "gepa_train_bs": "train_batch_size", + "gepa_merge_every": "merge_every", + "gepa_pareto_subset": "pareto_subset_size", + } + if algo_name == "GEPA-Base": + base["gepa_iters"] = "num_iters" + else: + base["gepa_iters"] = "num_search_iterations" + return base + + +def 
resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: + kwargs = _default_trainer_kwargs(algo_name) + alias_map = _param_alias_map(algo_name) + for key, value in (params or {}).items(): + if key in _FILTERED_KWARGS: + continue + mapped_key = alias_map.get(key, key) + kwargs[mapped_key] = value + return kwargs + + +def _clone(value: Any) -> Any: + if isinstance(value, dict): + return {k: _clone(v) for k, v in value.items()} + if isinstance(value, list): + return [_clone(v) for v in value] + return value + + +def merge_kwargs(base: Any, override: Any) -> Any: + if override is None: + return _clone(base) + if base is None: + return _clone(override) + if isinstance(base, dict) and isinstance(override, dict): + merged = dict(base) + merged.update(override) + return merged + if isinstance(base, list) and isinstance(override, dict): + if not base: + return [_clone(override)] + return [ + merge_kwargs(item, override) if isinstance(item, (dict, list)) else _clone(item) + for item in base + ] + if isinstance(base, dict) and isinstance(override, list): + if not override: + return _clone(base) + return [ + merge_kwargs(base, item) if isinstance(item, (dict, list)) else _clone(item) + for item in override + ] + if isinstance(base, list) and isinstance(override, list): + merged: List[Any] = [] + max_len = max(len(base), len(override)) + for idx in range(max_len): + left = base[idx] if idx < len(base) else None + right = override[idx] if idx < len(override) else None + if left is None: + merged.append(_clone(right)) + elif right is None: + merged.append(_clone(left)) + else: + merged.append(merge_kwargs(left, right)) + return merged + return _clone(override) + + +__all__ = ["resolve_trainer_kwargs", "merge_kwargs"] diff --git a/trace_bench/results.py b/trace_bench/results.py index 3fcb4a9..d19402e 100644 --- a/trace_bench/results.py +++ b/trace_bench/results.py @@ -61,14 +61,22 @@ def build_results_row( "score_final": score_final, "score_best": 
score_best, "time_seconds": round(time_seconds, 6), - "resolved_trainer_kwargs": _json_cell(resolved_trainer_kwargs), - "resolved_optimizer_kwargs": _json_cell(resolved_optimizer_kwargs), - "eval_kwargs": _json_cell(eval_kwargs), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "eval_kwargs": eval_kwargs, "feedback": feedback or "", "tb_logdir": tb_logdir, } +def build_results_csv_row(row: Dict[str, Any]) -> Dict[str, Any]: + csv_row = dict(row) + csv_row["resolved_trainer_kwargs"] = _json_cell(row.get("resolved_trainer_kwargs")) + csv_row["resolved_optimizer_kwargs"] = _json_cell(row.get("resolved_optimizer_kwargs")) + csv_row["eval_kwargs"] = _json_cell(row.get("eval_kwargs")) + return csv_row + + def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: counts: Dict[str, int] = {"ok": 0, "failed": 0, "skipped": 0} for row in rows: @@ -79,4 +87,4 @@ def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: return {"counts": counts, "total_jobs": len(rows)} -__all__ = ["RESULT_COLUMNS", "build_results_row", "summarize_results"] +__all__ = ["RESULT_COLUMNS", "build_results_row", "build_results_csv_row", "summarize_results"] diff --git a/trace_bench/runner.py b/trace_bench/runner.py index 6581c4e..20532e3 100644 --- a/trace_bench/runner.py +++ b/trace_bench/runner.py @@ -3,7 +3,8 @@ from dataclasses import dataclass from datetime import datetime from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple +import json import random import time @@ -21,10 +22,11 @@ write_job_results, write_summary, ) -from trace_bench.config import RunConfig, TrainerConfig +from trace_bench.config import RunConfig, TaskConfig, TrainerConfig from trace_bench.matrix import JobSpec, compute_run_id, expand_matrix from trace_bench.registry import load_task_bundle -from trace_bench.results import RESULT_COLUMNS, build_results_row, 
summarize_results +from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs +from trace_bench.results import RESULT_COLUMNS, build_results_csv_row, build_results_row, summarize_results try: @@ -80,69 +82,20 @@ def _resolve_algorithm(name: str): return name -def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: - if algo_name == "PrioritySearch": - return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) - if algo_name == "GEPA-Base": - return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) - # GEPA-UCB and GEPA-Beam use num_search_iterations - return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) - - -def _param_alias_map(algo_name: str) -> Dict[str, str]: - """Return config-alias → opto-kwarg mapping for the given algorithm.""" - base = { - "ps_steps": "num_steps", - "ps_batches": "num_batches", - "ps_candidates": "num_candidates", - "ps_proposals": "num_proposals", - "ps_mem_update": "memory_update_frequency", - "gepa_train_bs": "train_batch_size", - "gepa_merge_every": "merge_every", - "gepa_pareto_subset": "pareto_subset_size", - } - if algo_name == "GEPA-Base": - base["gepa_iters"] = "num_iters" - else: - base["gepa_iters"] = "num_search_iterations" - return base - - -# Keys that should NOT be passed to opto_trainer.train() -_FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs", "threads"} - - -def _resolve_train_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: - """Map config aliases to actual train() kwargs and filter non-train keys.""" - kwargs = _default_trainer_kwargs(algo_name) - alias_map = _param_alias_map(algo_name) - for key, value in params.items(): - if key in _FILTERED_KWARGS: - continue - mapped_key = alias_map.get(key, key) - kwargs[mapped_key] = value - return kwargs - - def _train_bundle(bundle: Dict[str, Any], trainer_spec: TrainerConfig, params: Dict[str, Any], mode: str) -> Dict[str, Any]: from opto import 
trainer as opto_trainer algo_name = trainer_spec.id algo = _resolve_algorithm(algo_name) - kwargs = _resolve_train_kwargs(params, algo_name) + kwargs = resolve_trainer_kwargs(params, algo_name) optimizer = trainer_spec.optimizer guide = trainer_spec.guide or bundle["guide"] logger = trainer_spec.logger or "ConsoleLogger" - guide_kwargs = trainer_spec.guide_kwargs or {} - logger_kwargs = trainer_spec.logger_kwargs or {} + guide_kwargs = merge_kwargs(bundle.get("guide_kwargs"), trainer_spec.guide_kwargs or {}) + logger_kwargs = merge_kwargs(bundle.get("logger_kwargs"), trainer_spec.logger_kwargs or {}) - optimizer_kwargs = bundle.get("optimizer_kwargs", {}) - override_opt_kwargs = trainer_spec.optimizer_kwargs or None - if override_opt_kwargs: - optimizer_kwargs = override_opt_kwargs - if isinstance(optimizer_kwargs, dict): - optimizer_kwargs = dict(optimizer_kwargs) + optimizer_kwargs = merge_kwargs(bundle.get("optimizer_kwargs", {}), trainer_spec.optimizer_kwargs or {}) if mode == "stub": try: @@ -196,6 +149,26 @@ def __init__(self, config: RunConfig, tasks_root: str | Path = "LLM4AD/benchmark self.tasks_root = Path(tasks_root) random.seed(self.config.seeds[0] if self.config.seeds else 123) self.artifacts: Optional[RunArtifacts] = None + self._bundle_cache: Dict[str, Dict[str, Any]] = {} + + def _bundle_cache_key(self, task: TaskConfig) -> str: + eval_sig = json.dumps(task.eval_kwargs or {}, sort_keys=True) + return f"{task.id}|{eval_sig}" + + def _get_bundle(self, task: TaskConfig) -> Tuple[str, Optional[Dict[str, Any]], Optional[str]]: + key = self._bundle_cache_key(task) + if key in self._bundle_cache: + cached = self._bundle_cache[key] + return cached["status"], cached.get("bundle"), cached.get("error") + try: + bundle = load_task_bundle(task.id, self.tasks_root, eval_kwargs=task.eval_kwargs) + entry = {"status": "ok", "bundle": bundle, "error": None} + except NotImplementedError as exc: + entry = {"status": "skipped", "bundle": None, "error": str(exc)} + 
except Exception as exc: + entry = {"status": "failed", "bundle": None, "error": f"task_load_error: {exc}"} + self._bundle_cache[key] = entry + return entry["status"], entry.get("bundle"), entry.get("error") def run(self) -> RunSummary: snapshot = self.config.snapshot() @@ -209,31 +182,55 @@ def run(self) -> RunSummary: write_git_json(self.artifacts.git_json) jobs = expand_matrix(self.config) - manifest = { - "run_id": run_id, - "generated_at": datetime.utcnow().isoformat() + "Z", - "jobs": [ + + results: List[Dict[str, Any]] = [] + for job in jobs: + results.append(self._run_job(job)) + if self.config.fail_fast and results[-1].get("status") == "failed": + break + + result_by_job = {row.get("job_id"): row for row in results} + manifest_jobs: List[Dict[str, Any]] = [] + for job in jobs: + row = result_by_job.get(job.job_id, {}) + resolved_trainer_kwargs = resolve_trainer_kwargs(job.params, job.trainer_id) + status_hint, bundle, skip_reason = self._get_bundle(job.task) + resolved_optimizer_kwargs = merge_kwargs( + bundle.get("optimizer_kwargs", {}) if bundle else {}, + job.trainer.optimizer_kwargs or {}, + ) + resolved_guide_kwargs = merge_kwargs( + bundle.get("guide_kwargs") if bundle else {}, + job.trainer.guide_kwargs or {}, + ) + resolved_logger_kwargs = merge_kwargs( + bundle.get("logger_kwargs") if bundle else {}, + job.trainer.logger_kwargs or {}, + ) + eval_kwargs = row.get("eval_kwargs") or dict(job.task.eval_kwargs or {}) + manifest_jobs.append( { "job_id": job.job_id, "task_id": job.task_id, "suite": job.suite, "trainer_id": job.trainer_id, "seed": job.seed, - "resolved_trainer_kwargs": job.resolved_kwargs.get("trainer_kwargs", {}), - "resolved_optimizer_kwargs": job.resolved_kwargs.get("optimizer_kwargs", {}), - "eval_kwargs": job.resolved_kwargs.get("eval_kwargs", {}), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + 
"resolved_logger_kwargs": resolved_logger_kwargs, + "eval_kwargs": eval_kwargs, + "status_hint": status_hint, + "skip_reason": skip_reason or "", } - for job in jobs - ], + ) + manifest = { + "run_id": run_id, + "generated_at": datetime.utcnow().isoformat() + "Z", + "jobs": manifest_jobs, } write_manifest(self.artifacts.manifest_json, manifest) - results: List[Dict[str, Any]] = [] - for job in jobs: - results.append(self._run_job(job)) - if self.config.fail_fast and results[-1].get("status") == "failed": - break - write_summary(self.artifacts.summary_json, summarize_results(results)) return RunSummary(run_id=run_id, results=results) @@ -244,24 +241,21 @@ def _run_job(self, job: JobSpec) -> Dict[str, Any]: status = "ok" feedback: Optional[str] = None - try: - bundle = load_task_bundle(job.task_id, self.tasks_root, eval_kwargs=job.task.eval_kwargs) - except NotImplementedError as exc: - status = "skipped" - feedback = str(exc) - bundle = None - except Exception as exc: - status = "failed" - feedback = f"task_load_error: {exc}" - bundle = None + status_hint, bundle, bundle_error = self._get_bundle(job.task) + if status_hint != "ok": + status = status_hint + feedback = bundle_error score_initial = None score_final = None score_best = None resolved_optimizer_kwargs: Dict[str, Any] = dict(job.trainer.optimizer_kwargs or {}) - resolved_trainer_kwargs: Dict[str, Any] = dict(job.params) + resolved_trainer_kwargs: Dict[str, Any] = resolve_trainer_kwargs(job.params, job.trainer_id) if bundle is not None and status == "ok": + resolved_optimizer_kwargs = merge_kwargs( + bundle.get("optimizer_kwargs", {}), job.trainer.optimizer_kwargs or {} + ) if not _has_trainables(bundle["param"]): status = "failed" feedback = "no_trainable_parameters" @@ -270,7 +264,7 @@ def _run_job(self, job: JobSpec) -> Dict[str, Any]: score_initial = initial.get("score") train_result = _train_bundle(bundle, job.trainer, job.params, self.config.mode) status = train_result.get("status", "ok") - 
resolved_optimizer_kwargs = train_result.get("optimizer_kwargs") or {} + resolved_optimizer_kwargs = train_result.get("optimizer_kwargs") or resolved_optimizer_kwargs resolved_trainer_kwargs = train_result.get("trainer_kwargs") or resolved_trainer_kwargs if status == "failed": feedback = f"training_error: {train_result.get('error', 'unknown')}" @@ -304,6 +298,14 @@ def _run_job(self, job: JobSpec) -> Dict[str, Any]: feedback=feedback, tb_logdir=tb_rel, ) + resolved_guide_kwargs = merge_kwargs( + bundle.get("guide_kwargs") if bundle else {}, + job.trainer.guide_kwargs, + ) + resolved_logger_kwargs = merge_kwargs( + bundle.get("logger_kwargs") if bundle else {}, + job.trainer.logger_kwargs, + ) job_meta = { "job_id": job.job_id, "task_id": job.task_id, @@ -314,6 +316,8 @@ def _run_job(self, job: JobSpec) -> Dict[str, Any]: "params": job.params, "resolved_trainer_kwargs": resolved_trainer_kwargs, "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, "optimizer": job.trainer.optimizer, "optimizer_kwargs": job.trainer.optimizer_kwargs, "guide": job.trainer.guide, @@ -325,7 +329,7 @@ def _run_job(self, job: JobSpec) -> Dict[str, Any]: "tb_logdir": tb_rel, } write_job_meta(job_artifacts.job_meta, job_meta) - append_results_csv(self.artifacts.results_csv, RESULT_COLUMNS, row) + append_results_csv(self.artifacts.results_csv, RESULT_COLUMNS, build_results_csv_row(row)) append_event(job_artifacts.events_jsonl, row) write_job_results(job_artifacts.results_json, row) return row From 51622f25c26a37ff1832a79fad3bc03438f3a262 Mon Sep 17 00:00:00 2001 From: Asad Date: Wed, 11 Feb 2026 10:58:50 +0500 Subject: [PATCH 4/8] Update 01_m1_minimal_api.ipynb --- notebooks/01_m1_minimal_api.ipynb | 3072 +++++++++++++++-------------- 1 file changed, 1547 insertions(+), 1525 deletions(-) diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb index 
4d8670c..d6114aa 100644 --- a/notebooks/01_m1_minimal_api.ipynb +++ b/notebooks/01_m1_minimal_api.ipynb @@ -1,1545 +1,1567 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "euYNX4m-m0Ty" - }, - "source": [ - "# Trace-Bench M1 \u2014 Minimal API Validation\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", - "\n", - "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", - "\n", - "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." - ], - "id": "euYNX4m-m0Ty" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u5DVjcAAm0UH" - }, - "source": [ - "## Expected Outputs\n", - "\n", - "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", - "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", - "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", - "- Internal non-trainable job shows `status=failed` with reason.\n", - "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", - "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." 
- ], - "id": "u5DVjcAAm0UH" - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "8D3DGyVXm0UJ", - "outputId": "aadad0ba-037c-4ffc-8d5a-4c55fb9d0d3f", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n", - "Runs dir: /content/drive/MyDrive/bench/2026-02-09/trace_bench\n", - "API key found \u2014 running in REAL mode (model: gpt-4o-mini)\n", - "\n", - "Mode: real\n" - ] - } - ], - "source": [ - "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", - "from datetime import date\n", - "from pathlib import Path\n", - "import os\n", - "\n", - "try:\n", - " from google.colab import drive\n", - " drive.mount(\"/content/drive\")\n", - "except Exception:\n", - " pass\n", - "\n", - "\n", - "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", - " drive_root = Path(\"/content/drive/MyDrive\")\n", - " root = drive_root if drive_root.is_dir() else Path(local)\n", - " out = root / project / date.today().isoformat() / sub\n", - " out.mkdir(parents=True, exist_ok=True)\n", - " return str(out)\n", - "\n", - "RUNS_DIR = bench_dir()\n", - "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", - "print(\"Runs dir:\", RUNS_DIR)\n", - "\n", - "# --- Auto-detect API key (real mode by default) ---\n", - "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", - "if not API_KEY:\n", - " try:\n", - " from google.colab import userdata\n", - " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", - " except Exception:\n", - " pass\n", - "\n", - "if API_KEY:\n", - " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", - " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", - " os.environ[\"TRACE_LITELLM_MODEL\"] = \"openrouter/openai/gpt-4o-mini\"\n", - " MODE = \"real\"\n", - " print(f\"API key found \u2014 running in REAL mode (model: gpt-4o-mini)\")\n", - "else:\n", - " MODE = \"stub\"\n", - " 
print(\"WARNING: No OPENROUTER_API_KEY found. Falling back to STUB mode.\")\n", - " print(\" All outputs below are labeled STUB \u2014 not real LLM results.\")\n", - "\n", - "os.environ[\"TB_MODE\"] = MODE\n", - "print(f\"\\nMode: {MODE}\")" - ], - "id": "8D3DGyVXm0UJ" - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "swOi3Bhtm0UQ", - "outputId": "e9806308-35f8-48c5-e6b2-e5f46530a497", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ + "cells": [ { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Trace-Bench'...\n", - "remote: Enumerating objects: 315, done.\u001b[K\n", - "remote: Counting objects: 100% (315/315), done.\u001b[K\n", - "remote: Compressing objects: 100% (222/222), done.\u001b[K\n", - "remote: Total 315 (delta 42), reused 274 (delta 36), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (315/315), 3.86 MiB | 8.12 MiB/s, done.\n", - "Resolving deltas: 100% (42/42), done.\n", - "Cloning into 'OpenTrace'...\n", - "remote: Enumerating objects: 228, done.\u001b[K\n", - "remote: Counting objects: 100% (228/228), done.\u001b[K\n", - "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", - "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (228/228), 4.73 MiB | 14.77 MiB/s, done.\n", - "Resolving deltas: 100% (17/17), done.\n", - "/content/Trace-Bench\n", - "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", - "Get:2 https://cli.github.com/packages stable InRelease [3,917 B]\n", - "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", - "Get:4 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", - "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n", - "Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", - "Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease 
[128 kB]\n", - "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", - "Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,893 kB]\n", - "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", - "Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", - "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", - "Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", - "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", - "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,661 kB]\n", - "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", - "Get:17 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,716 kB]\n", - "Get:18 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", - "Get:19 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", - "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", - "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", - "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,035 kB]\n", - "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", - "Fetched 37.1 MB in 6s (6,435 kB/s)\n", - "Reading package lists... Done\n", - "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", - "Reading package lists... Done\n", - "Building dependency tree... Done\n", - "Reading state information... 
Done\n", - "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", - "0 upgraded, 0 newly installed, 0 to remove and 55 not upgraded.\n", - "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", - "Collecting pip\n", - " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", - "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: pip\n", - " Attempting uninstall: pip\n", - " Found existing installation: pip 24.1.2\n", - " Uninstalling pip-24.1.2:\n", - " Successfully uninstalled pip-24.1.2\n", - "Successfully installed pip-26.0.1\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", - "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", - "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", - "Collecting litellm==1.75.0\n", - " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", - "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", - "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", - "Requirement already satisfied: 
importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", - "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", - "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", - "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.16.0)\n", - "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", - "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", - "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", - "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", - "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", - "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", - "Requirement 
already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", - "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", - "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", - "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", - "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", - "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", - "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", - "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", - "Requirement already satisfied: 
aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", - "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", - "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", - "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", - "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", - "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", - 
"Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", - "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", - "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", - "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.3.7)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", - "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", - "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", - "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", - "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", - "\u001b[2K 
\u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: litellm\n", - "Successfully installed litellm-1.75.0\n" - ] - } - ], - "source": [ - "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", - "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", - "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", - "\n", - "%cd Trace-Bench\n", - "\n", - "# System + Python deps\n", - "!apt-get update -y && apt-get install -y graphviz\n", - "!python -m pip install -U pip\n", - "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" - ], - "id": "swOi3Bhtm0UQ" - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "a__iRJTHm0UR", - "outputId": "f48aba86-b779-4537-f5ce-8d5b2bdc4154", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "euYNX4m-m0Ty" + }, + "source": [ + "# Trace-Bench M1 — Minimal API Validation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", + "\n", + "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", + "\n", + "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." 
+ ], + "id": "euYNX4m-m0Ty" + }, { - "output_type": "stream", - "name": "stdout", - "text": [ - "=== List trainers ===\n", - "PrioritySearch\tavailable\n", - "GEPA-Base\tavailable\n", - "GEPA-UCB\tavailable\n", - "GEPA-Beam\tavailable\n", - "\n", - "=== Validate config (strict) ===\n", - "[OK] internal:code_param\n", - "[OK] internal:numeric_param\n", - "[OK] internal:multi_param\n", - "[OK] internal:non_trainable\n", - "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", - "[OK] trace_examples:greeting_stub\n", - "[OK] llm4ad:circle_packing\n", - "[SKIP] veribench:smoke_placeholder: VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.\n", - "\n", - "[OK] matrix: 28 jobs expanded deterministically\n", - " job 6f3619dd9ae0: internal:code_param x PrioritySearch (seed=123)\n", - " job c486ba93400f: internal:code_param x GEPA-Base (seed=123)\n", - " job a84d2486d31a: internal:code_param x GEPA-UCB (seed=123)\n", - " job 8ecff95cfafa: internal:code_param x GEPA-Beam (seed=123)\n", - " job 778da61d2682: internal:numeric_param x PrioritySearch (seed=123)\n", - " job 4b3a7f322126: internal:numeric_param x GEPA-Base (seed=123)\n", - " job 4b9c7d66d866: internal:numeric_param x GEPA-UCB (seed=123)\n", - " job 54df742bb5e9: internal:numeric_param x GEPA-Beam (seed=123)\n", - " job 0bfef35f6ef3: internal:multi_param x PrioritySearch (seed=123)\n", - " job e06adbe6489b: internal:multi_param x GEPA-Base (seed=123)\n", - " job 8669d9b963d4: internal:multi_param x GEPA-UCB (seed=123)\n", - " job 90d23f88baf7: internal:multi_param x GEPA-Beam (seed=123)\n", - " job d6aa82e5d119: internal:non_trainable x PrioritySearch (seed=123)\n", - " job 4f655637a6dc: internal:non_trainable x GEPA-Base (seed=123)\n", - " job 85940a1b71e7: internal:non_trainable x GEPA-UCB (seed=123)\n", - " job dafcec9c13af: internal:non_trainable x GEPA-Beam (seed=123)\n", - " job e8e9938a4ef6: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", - " job 4715e211f8a9: 
trace_examples:greeting_stub x GEPA-Base (seed=123)\n", - " job 8c4ec9f3e355: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", - " job 2f84751a35ad: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", - " job da0e8ae694f1: llm4ad:circle_packing x PrioritySearch (seed=123)\n", - " job 0865599891de: llm4ad:circle_packing x GEPA-Base (seed=123)\n", - " job d25dcdb59892: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", - " job d985faad90f4: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", - " job 364d89b28934: veribench:smoke_placeholder x PrioritySearch (seed=123)\n", - " job 721282ed015b: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", - " job 5b657b995d7a: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", - " job 77b3e4cb5bf0: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", - "\n", - " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", - " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 'PrioritySearch']\n", - "[OK] manifest written: runs/20260209-153344-8f7a72b4/meta/manifest.json\n", - "\n", - "=== Generate M1 run config (mode=real) ===\n", - "Config mode: real\n", - "\n", - "=== Run M1 validation ===\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", - "Epoch: 0. 
Iteration: 0\n", - "[Step 0] Test/test_score: 1.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: 1.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", - " return code\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 1.0\n", - "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 1\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 2\n", - "[Step 1] Update/best_candidate_priority: 1.0\n", - "[Step 1] Update/best_candidate_mean_score: 1.0\n", - "[Step 1] Update/best_candidate_num_rollouts: 2\n", - "[Step 1] Update/num_exploration_candidates: 1\n", - "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", - "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", - "[Step 1] Sample/mean_score: 1.0\n", - "[Step 1] Sample/num_samples: 1\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", - "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", - " return code\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", - "Epoch: 0. 
Iteration: 0\n", - "[Step 0] Test/test_score: -3.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -3.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", - " return value\u001b[0m\n", - "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.0\n", - "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 3\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.0\n", - "[Step 1] Update/best_candidate_mean_score: 0.0\n", - "[Step 1] Update/best_candidate_num_rollouts: 2\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", - "[Step 1] Sample/mean_score: 0.0\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", - " return value\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", - "Epoch: 0. 
Iteration: 0\n", - "[Step 0] Test/test_score: -1.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -1.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", - "[Step 0] \u001b[91mParameter/float:3: 1.0\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", - " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.0\n", - "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.0\n", - "[Step 1] Update/best_candidate_mean_score: 0.0\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: 0.0\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:2: 1.5\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:3: 1.5\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", - " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", - "Epoch: 0. 
Iteration: 0\n", - "[Step 0] Test/test_score: -1000000.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -1000000.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code:3: import numpy as np\n", - "import math\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 1.375582371483138\n", - "[Step 1] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 1.375582371483138\n", - "[Step 1] Update/best_candidate_mean_score: 1.375582371483138\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 1.0407921408122753\n", - "[Step 1] Update/exploration_candidates_mean_score: 1.0407921408122753\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: -1000000.0\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:3: import numpy as np\n", - "import random\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " random.seed(2025)\n", - " np.random.seed(2025)\n", - "\n", - " circles = []\n", - " radii = np.random.uniform(0.01, 0.1, size=n) # Random radii between 0.01 and 0.1\n", - "\n", - " for _ in range(n):\n", - " placed = False\n", - " while not placed:\n", - " radius = np.random.choice(radii)\n", - " x 
= np.random.uniform(radius, 1 - radius)\n", - " y = np.random.uniform(radius, 1 - radius)\n", - " overlap = False\n", - " \n", - " # Check for overlap\n", - " for circle in circles:\n", - " if np.sqrt((circle[0] - x) ** 2 + (circle[1] - y) ** 2) < (circle[2] + radius):\n", - " overlap = True\n", - " break\n", - " \n", - " if not overlap:\n", - " circles.append([x, y, radius])\n", - " placed = True\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "u5DVjcAAm0UH" + }, + "source": [ + "## Expected Outputs\n", + "\n", + "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", + "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", + "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", + "- Internal non-trainable job shows `status=failed` with reason.\n", + "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", + "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." 
+ ], + "id": "u5DVjcAAm0UH" }, { - "output_type": "stream", - "name": "stderr", - "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml <=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.17.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in 
/usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", + "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages 
(from matplotlib) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) 
(0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", + 
"Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: litellm\n", + "Successfully installed litellm-1.75.0\n" + ] + } + ], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ], + "id": "swOi3Bhtm0UQ" }, { - "output_type": "execute_result", - "data": { - "text/plain": [ - " run_id job_id task_id suite \\\n", - "0 20260209-153346-0daa4bb9 6f3619dd9ae0 internal:code_param internal \n", - "1 20260209-153346-0daa4bb9 c486ba93400f internal:code_param internal \n", - "2 20260209-153346-0daa4bb9 778da61d2682 internal:numeric_param internal \n", - "3 20260209-153346-0daa4bb9 4b3a7f322126 internal:numeric_param internal \n", - "4 20260209-153346-0daa4bb9 0bfef35f6ef3 internal:multi_param internal \n", - "\n", - " trainer_id seed status score_initial score_final score_best \\\n", - "0 PrioritySearch 
123 ok 1.0 1.0 1.0 \n", - "1 GEPA-Base 123 ok 1.0 1.0 1.0 \n", - "2 PrioritySearch 123 ok -3.0 -0.0 -0.0 \n", - "3 GEPA-Base 123 ok -3.0 -0.0 -0.0 \n", - "4 PrioritySearch 123 ok -1.0 -0.0 -0.0 \n", - "\n", - " time_seconds resolved_trainer_kwargs \\\n", - "0 10.507114 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", - "1 1.279633 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", - "2 4.215786 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", - "3 3.031100 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", - "4 3.620341 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", - "\n", - " resolved_optimizer_kwargs eval_kwargs \\\n", - "0 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", - "1 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", - "2 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", - "3 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", - "4 {\"memory_size\": 5, \"objective\": \"Make a+b matc... 
{\"timeout_seconds\": 10} \n", - "\n", - " feedback tb_logdir \n", - "0 Correct jobs/6f3619dd9ae0/tb \n", - "1 Correct jobs/c486ba93400f/tb \n", - "2 target=3.0 jobs/778da61d2682/tb \n", - "3 target=3.0 jobs/4b3a7f322126/tb \n", - "4 target=3.0 jobs/0bfef35f6ef3/tb " + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "a__iRJTHm0UR", + "outputId": "e2225467-6561-4c48-a5a4-040d41fa9b69", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== List trainers ===\n", + "PrioritySearch\tavailable\n", + "GEPA-Base\tavailable\n", + "GEPA-UCB\tavailable\n", + "GEPA-Beam\tavailable\n", + "\n", + "=== Validate config (strict) ===\n", + "\n", + "=== Generate M1 run config (mode=real) ===\n", + "Config mode: real\n", + "\n", + "=== Run M1 validation ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:3: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 5\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.5\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:3: 2.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:22: Hello\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:22: Hello\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.7469557727446884\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.6367605793\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.7469557727446884\n", + "[Step 1] Update/best_candidate_mean_score: 0.7469557727446884\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.7264788414430507\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.7264788414430507\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.7264788414430507\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + "\n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " np.random.seed(2025) # Set random seed for reproducibility\n", + " circles = []\n", + " total_radius = 0.0\n", + "\n", + " for _ in range(n):\n", + " radius = np.random.uniform(0.01, 0.05) # Dynamically assign radius\n", + " while True:\n", + " x = np.random.uniform(radius, 1 - 
radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " overlap = False\n", + " for circle in circles:\n", + " if math.hypot(circle[0] - x, circle[1] - y) < (circle[2] + radius):\n", + " overlap = True\n", + " break\n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " total_radius += radius\n", + " break\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.5\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "usage: trace-bench [-h] {list-tasks,list-trainers,validate,run,ui} ...\n", + "trace-bench: error: unrecognized arguments: --runs-dir /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260209-153346-0daa4bb96f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.010.507114{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260209-153346-0daa4bb9c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.01.279633{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260209-153346-0daa4bb9778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.04.215786{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260209-153346-0daa4bb94b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.03.031100{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260209-153346-0daa4bb90bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.03.620341{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n" + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "echo \"=== List trainers ===\"\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench list-trainers\n", + "\n", + "echo \"\"\n", + "echo \"=== Validate config (strict) ===\"\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench validate --config configs/m1_validation.yaml --strict --runs-dir \"$RUNS_DIR\"\n", + "\n", + "echo \"\"\n", + "echo \"=== Generate M1 run config (mode=$TB_MODE) ===\"\n", + "cat > /content/m1_run.yaml < np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.6499617928349034\n", - "[Step 1] \u001b[94mAlgo/Average train score: -749999.8375095518\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.6499617928349034\n", - "[Step 1] Update/best_candidate_mean_score: 0.6499617928349034\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: -499999.67501910357\n", - "[Step 1] Update/exploration_candidates_mean_score: -499999.67501910357\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: -499999.67501910357\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", - "import math\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - "\n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - "\n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - " np.random.seed(2025)\n", - " \n", - " circles = []\n", - " for _ in range(n):\n", - " radius = np.random.rand() * 0.05 # Variable radius, capped to keep circles small\n", - " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Ensures circles fit in unit square\n", - "\n", - " # Check 
for overlapping\n", - " while any(np.linalg.norm([x - circle[0], y - circle[1]]) < (radius + circle[2]) for circle in circles):\n", - " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Reposition if overlap detected\n", - "\n", - " circles.append([x, y, radius])\n", - " \n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 1.4689943904012859\u001b[0m\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "ckY1HmQam0UU", + "outputId": "556ea369-3506-4d2e-db3c-da31382e8f7d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 764 + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Run dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench/20260211-055528-de435ae5\n", + "run_id: 20260211-055528-de435ae5\n", + "runs_dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", + "mode: real\n", + "seeds:\n", + "- 123\n", + "max_workers: 1\n", + "fail_fast: false\n", + "tasks:\n", + "- id: internal:code_param\n", + " eval_kwargs:\n", + " timeout_seconds: 10\n", + "- id: internal:numeric_param\n", + " eval_kwargs:\n", + " timeout_seconds: 10\n", + "- id: internal:multi_param\n", + " eval_kwargs:\n", + " timeout_seconds: 10\n", + "- id: internal:non_trainable\n", + " eval_kwargs:\n", + "Jobs in manifest: 14\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " run_id job_id task_id suite \\\n", + "0 20260211-055528-de435ae5 6f3619dd9ae0 internal:code_param internal \n", + "1 20260211-055528-de435ae5 c486ba93400f internal:code_param internal \n", + "2 20260211-055528-de435ae5 778da61d2682 internal:numeric_param internal \n", + "3 20260211-055528-de435ae5 4b3a7f322126 internal:numeric_param internal \n", + "4 20260211-055528-de435ae5 0bfef35f6ef3 internal:multi_param internal \n", + "\n", + " trainer_id seed status score_initial score_final score_best \\\n", + "0 PrioritySearch 123 ok 1.0 1.0 1.0 \n", + "1 GEPA-Base 123 ok 1.0 
1.0 1.0 \n", + "2 PrioritySearch 123 ok -3.0 -0.0 -0.0 \n", + "3 GEPA-Base 123 ok -3.0 -0.0 -0.0 \n", + "4 PrioritySearch 123 ok -1.0 -0.0 -0.0 \n", + "\n", + " time_seconds resolved_trainer_kwargs \\\n", + "0 8.531946 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "1 3.063799 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", + "2 3.603461 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "3 4.485549 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", + "4 4.149766 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "\n", + " resolved_optimizer_kwargs eval_kwargs \\\n", + "0 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "1 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "2 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "3 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "4 {\"memory_size\": 5, \"objective\": \"Make a+b matc... {\"timeout_seconds\": 10} \n", + "\n", + " feedback tb_logdir \n", + "0 Correct jobs/6f3619dd9ae0/tb \n", + "1 Correct jobs/c486ba93400f/tb \n", + "2 target=3.0 jobs/778da61d2682/tb \n", + "3 target=3.0 jobs/4b3a7f322126/tb \n", + "4 target=3.0 jobs/0bfef35f6ef3/tb " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260211-055528-de435ae56f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.08.531946{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260211-055528-de435ae5c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.03.063799{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260211-055528-de435ae5778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.03.603461{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260211-055528-de435ae54b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.04.485549{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260211-055528-de435ae50bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.04.149766{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 14,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260211-055528-de435ae5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"4715e211f8a9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"trace_examples\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 421636.810540172,\n \"min\": -1000000.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": {\n \"dtype\": 
\"number\",\n \"std\": 0.47782940218389114,\n \"min\": -0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47782940218389114,\n \"min\": -0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.637736580815169,\n \"min\": 3.3e-05,\n \"max\": 21.165263,\n \"num_unique_values\": 13,\n \"samples\": [\n 7.290935\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the target code exactly.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"jobs/4715e211f8a9/tb\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + 
} + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" }, { - "output_type": "stream", - "name": "stderr", - "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml < np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " 
\n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 1.3000000000000003\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.545\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_mean_score: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.9100000000000001\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.9100000000000001\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.9100000000000001\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import random\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All 
values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " random.seed(2025)\n", + " np.random.seed(2025)\n", + " \n", + " circles = []\n", + " radius = 0.05 # Starting radius\n", + "\n", + " for _ in range(n):\n", + " while True:\n", + " x = np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " overlap = False\n", + " \n", + " for cx, cy, r in circles:\n", + " distance = np.sqrt((cx - x) ** 2 + (cy - y) ** 2)\n", + " if distance < (r + radius):\n", + " overlap = True\n", + " break\n", + " \n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " break\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml <\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.0
1internal:numeric_paraminternalGEPA-Base123ok-0.0
2llm4ad:circle_packingllm4adPrioritySearch123ok1.3
3llm4ad:circle_packingllm4adGEPA-Base123ok-1000000.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 500000.2166670422,\n \"min\": -1000000.0,\n \"max\": 1.3000000000000005,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.000000
1internal:numeric_paraminternalGEPA-Base123ok-0.000000
2llm4ad:circle_packingllm4adPrioritySearch123ok0.649962
3llm4ad:circle_packingllm4adGEPA-Base123ok1.468994
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" + "source": [ + "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.697113339555075,\n \"min\": -0.0,\n \"max\": 1.468994390401286,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 6 + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + }, + "colab": { + "provenance": [] } - ], - "source": [ - "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", - "import json, pathlib, pandas as pd\n", - "\n", - "runs_root = pathlib.Path(RUNS_DIR)\n", - "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", - "\n", - "matrix_dir = None\n", - "for p in reversed(candidates):\n", - " summary_path = p / \"summary.json\"\n", - " if not summary_path.exists():\n", - " continue\n", - " try:\n", - " summary = json.loads(summary_path.read_text())\n", - " except Exception:\n", - " continue\n", - " if summary.get(\"total_jobs\") == 4:\n", - " matrix_dir = p\n", - " break\n", - "\n", - "if matrix_dir is None:\n", - " raise 
FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", - "\n", - "print(\"Matrix run dir:\", matrix_dir)\n", - "\n", - "df = pd.read_csv(matrix_dir / \"results.csv\")\n", - "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", - "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", - "\n", - "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", - "print(f\"summary.json: {summary}\")\n", - "assert summary.get(\"total_jobs\") == 4\n", - "\n", - "print(\"\\n--- Matrix results ---\")\n", - "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" - ], - "id": "W18tGXfYm0UZ" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10" }, - "colab": { - "provenance": [] - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From 61713b9d7ad3ffdcb94feeca0ca7634bc6cfeb7c Mon Sep 17 00:00:00 2001 From: Asad Date: Wed, 11 Feb 2026 15:47:06 +0500 Subject: [PATCH 5/8] Revert "Update 01_m1_minimal_api.ipynb" This reverts commit 51622f25c26a37ff1832a79fad3bc03438f3a262. 
--- notebooks/01_m1_minimal_api.ipynb | 3072 ++++++++++++++--------------- 1 file changed, 1525 insertions(+), 1547 deletions(-) diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb index d6114aa..4d8670c 100644 --- a/notebooks/01_m1_minimal_api.ipynb +++ b/notebooks/01_m1_minimal_api.ipynb @@ -1,1567 +1,1545 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "euYNX4m-m0Ty" + }, + "source": [ + "# Trace-Bench M1 \u2014 Minimal API Validation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", + "\n", + "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", + "\n", + "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." + ], + "id": "euYNX4m-m0Ty" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5DVjcAAm0UH" + }, + "source": [ + "## Expected Outputs\n", + "\n", + "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", + "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", + "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", + "- Internal non-trainable job shows `status=failed` with reason.\n", + "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", + "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." 
+ ], + "id": "u5DVjcAAm0UH" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "8D3DGyVXm0UJ", + "outputId": "aadad0ba-037c-4ffc-8d5a-4c55fb9d0d3f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "euYNX4m-m0Ty" - }, - "source": [ - "# Trace-Bench M1 — Minimal API Validation\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", - "\n", - "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", - "\n", - "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." - ], - "id": "euYNX4m-m0Ty" - }, + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "Runs dir: /content/drive/MyDrive/bench/2026-02-09/trace_bench\n", + "API key found \u2014 running in REAL mode (model: gpt-4o-mini)\n", + "\n", + "Mode: real\n" + ] + } + ], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive_root = Path(\"/content/drive/MyDrive\")\n", + " root = drive_root if drive_root.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", 
RUNS_DIR)\n", + "\n", + "# --- Auto-detect API key (real mode by default) ---\n", + "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", + "if not API_KEY:\n", + " try:\n", + " from google.colab import userdata\n", + " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", + " except Exception:\n", + " pass\n", + "\n", + "if API_KEY:\n", + " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", + " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = \"openrouter/openai/gpt-4o-mini\"\n", + " MODE = \"real\"\n", + " print(f\"API key found \u2014 running in REAL mode (model: gpt-4o-mini)\")\n", + "else:\n", + " MODE = \"stub\"\n", + " print(\"WARNING: No OPENROUTER_API_KEY found. Falling back to STUB mode.\")\n", + " print(\" All outputs below are labeled STUB \u2014 not real LLM results.\")\n", + "\n", + "os.environ[\"TB_MODE\"] = MODE\n", + "print(f\"\\nMode: {MODE}\")" + ], + "id": "8D3DGyVXm0UJ" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "swOi3Bhtm0UQ", + "outputId": "e9806308-35f8-48c5-e6b2-e5f46530a497", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "u5DVjcAAm0UH" - }, - "source": [ - "## Expected Outputs\n", - "\n", - "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", - "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", - "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", - "- Internal non-trainable job shows `status=failed` with reason.\n", - "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", - "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." 
- ], - "id": "u5DVjcAAm0UH" - }, + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Trace-Bench'...\n", + "remote: Enumerating objects: 315, done.\u001b[K\n", + "remote: Counting objects: 100% (315/315), done.\u001b[K\n", + "remote: Compressing objects: 100% (222/222), done.\u001b[K\n", + "remote: Total 315 (delta 42), reused 274 (delta 36), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (315/315), 3.86 MiB | 8.12 MiB/s, done.\n", + "Resolving deltas: 100% (42/42), done.\n", + "Cloning into 'OpenTrace'...\n", + "remote: Enumerating objects: 228, done.\u001b[K\n", + "remote: Counting objects: 100% (228/228), done.\u001b[K\n", + "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", + "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (228/228), 4.73 MiB | 14.77 MiB/s, done.\n", + "Resolving deltas: 100% (17/17), done.\n", + "/content/Trace-Bench\n", + "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:2 https://cli.github.com/packages stable InRelease [3,917 B]\n", + "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:4 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", + "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n", + "Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", + "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", + "Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,893 kB]\n", + "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", + "Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:13 
http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", + "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,661 kB]\n", + "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", + "Get:17 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,716 kB]\n", + "Get:18 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", + "Get:19 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", + "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", + "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", + "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,035 kB]\n", + "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", + "Fetched 37.1 MB in 6s (6,435 kB/s)\n", + "Reading package lists... Done\n", + "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... 
Done\n", + "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 55 not upgraded.\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", + "Collecting pip\n", + " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 24.1.2\n", + " Uninstalling pip-24.1.2:\n", + " Successfully uninstalled pip-24.1.2\n", + "Successfully installed pip-26.0.1\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", + "Collecting litellm==1.75.0\n", + " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", + "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", + "Requirement already satisfied: 
importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.16.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", + "Requirement 
already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", + "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", + "Requirement already satisfied: 
aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", + 
"Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.3.7)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K 
\u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: litellm\n", + "Successfully installed litellm-1.75.0\n" + ] + } + ], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ], + "id": "swOi3Bhtm0UQ" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "a__iRJTHm0UR", + "outputId": "f48aba86-b779-4537-f5ce-8d5b2bdc4154", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "8D3DGyVXm0UJ", - "outputId": "879a2cbf-263e-4d80-bf7c-f3f01879432f", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n", - "Runs dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", - "API key found — running in REAL mode (model: gpt-4o-mini)\n", - "\n", - "Mode: real\n" - ] - } - ], - "source": [ - "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", - "from datetime import date\n", - "from pathlib import Path\n", - "import os\n", - "\n", - "try:\n", - " from google.colab import drive\n", - " 
drive.mount(\"/content/drive\")\n", - "except Exception:\n", - " pass\n", - "\n", - "\n", - "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", - " drive_root = Path(\"/content/drive/MyDrive\")\n", - " root = drive_root if drive_root.is_dir() else Path(local)\n", - " out = root / project / date.today().isoformat() / sub\n", - " out.mkdir(parents=True, exist_ok=True)\n", - " return str(out)\n", - "\n", - "RUNS_DIR = bench_dir()\n", - "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", - "print(\"Runs dir:\", RUNS_DIR)\n", - "\n", - "# --- Auto-detect API key (real mode by default) ---\n", - "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", - "if not API_KEY:\n", - " try:\n", - " from google.colab import userdata\n", - " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", - " except Exception:\n", - " pass\n", - "\n", - "if API_KEY:\n", - " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", - " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", - " os.environ[\"TRACE_LITELLM_MODEL\"] = \"openrouter/openai/gpt-4o-mini\"\n", - " MODE = \"real\"\n", - " print(f\"API key found — running in REAL mode (model: gpt-4o-mini)\")\n", - "else:\n", - " MODE = \"stub\"\n", - " print(\"WARNING: No OPENROUTER_API_KEY found. 
Falling back to STUB mode.\")\n", - " print(\" All outputs below are labeled STUB — not real LLM results.\")\n", - "\n", - "os.environ[\"TB_MODE\"] = MODE\n", - "print(f\"\\nMode: {MODE}\")" - ], - "id": "8D3DGyVXm0UJ" + "output_type": "stream", + "name": "stdout", + "text": [ + "=== List trainers ===\n", + "PrioritySearch\tavailable\n", + "GEPA-Base\tavailable\n", + "GEPA-UCB\tavailable\n", + "GEPA-Beam\tavailable\n", + "\n", + "=== Validate config (strict) ===\n", + "[OK] internal:code_param\n", + "[OK] internal:numeric_param\n", + "[OK] internal:multi_param\n", + "[OK] internal:non_trainable\n", + "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", + "[OK] trace_examples:greeting_stub\n", + "[OK] llm4ad:circle_packing\n", + "[SKIP] veribench:smoke_placeholder: VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.\n", + "\n", + "[OK] matrix: 28 jobs expanded deterministically\n", + " job 6f3619dd9ae0: internal:code_param x PrioritySearch (seed=123)\n", + " job c486ba93400f: internal:code_param x GEPA-Base (seed=123)\n", + " job a84d2486d31a: internal:code_param x GEPA-UCB (seed=123)\n", + " job 8ecff95cfafa: internal:code_param x GEPA-Beam (seed=123)\n", + " job 778da61d2682: internal:numeric_param x PrioritySearch (seed=123)\n", + " job 4b3a7f322126: internal:numeric_param x GEPA-Base (seed=123)\n", + " job 4b9c7d66d866: internal:numeric_param x GEPA-UCB (seed=123)\n", + " job 54df742bb5e9: internal:numeric_param x GEPA-Beam (seed=123)\n", + " job 0bfef35f6ef3: internal:multi_param x PrioritySearch (seed=123)\n", + " job e06adbe6489b: internal:multi_param x GEPA-Base (seed=123)\n", + " job 8669d9b963d4: internal:multi_param x GEPA-UCB (seed=123)\n", + " job 90d23f88baf7: internal:multi_param x GEPA-Beam (seed=123)\n", + " job d6aa82e5d119: internal:non_trainable x PrioritySearch (seed=123)\n", + " job 4f655637a6dc: internal:non_trainable x GEPA-Base (seed=123)\n", + " job 85940a1b71e7: internal:non_trainable x GEPA-UCB 
(seed=123)\n", + " job dafcec9c13af: internal:non_trainable x GEPA-Beam (seed=123)\n", + " job e8e9938a4ef6: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", + " job 4715e211f8a9: trace_examples:greeting_stub x GEPA-Base (seed=123)\n", + " job 8c4ec9f3e355: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", + " job 2f84751a35ad: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", + " job da0e8ae694f1: llm4ad:circle_packing x PrioritySearch (seed=123)\n", + " job 0865599891de: llm4ad:circle_packing x GEPA-Base (seed=123)\n", + " job d25dcdb59892: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", + " job d985faad90f4: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", + " job 364d89b28934: veribench:smoke_placeholder x PrioritySearch (seed=123)\n", + " job 721282ed015b: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", + " job 5b657b995d7a: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", + " job 77b3e4cb5bf0: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", + "\n", + " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", + " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 'PrioritySearch']\n", + "[OK] manifest written: runs/20260209-153344-8f7a72b4/meta/manifest.json\n", + "\n", + "=== Generate M1 run config (mode=real) ===\n", + "Config mode: real\n", + "\n", + "=== Run M1 validation ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:3: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:3: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:3: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.375582371483138\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.375582371483138\n", + "[Step 1] Update/best_candidate_mean_score: 1.375582371483138\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: -1000000.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:3: import numpy as np\n", + "import random\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " random.seed(2025)\n", + " np.random.seed(2025)\n", + "\n", + " circles = []\n", + " radii = np.random.uniform(0.01, 0.1, size=n) # Random radii between 0.01 and 0.1\n", + "\n", + " for _ in range(n):\n", + " placed = False\n", + " while not placed:\n", + " radius = np.random.choice(radii)\n", + " x 
= np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " overlap = False\n", + " \n", + " # Check for overlap\n", + " for circle in circles:\n", + " if np.sqrt((circle[0] - x) ** 2 + (circle[1] - y) ** 2) < (circle[2] + radius):\n", + " overlap = True\n", + " break\n", + " \n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " placed = True\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" + ] }, { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "swOi3Bhtm0UQ", - "outputId": "a7df1c4a-e213-46e3-d3ea-83db6eee60b7", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Cloning into 'Trace-Bench'...\n", - "remote: Enumerating objects: 315, done.\u001b[K\n", - "remote: Counting objects: 100% (315/315), done.\u001b[K\n", - "remote: Compressing objects: 100% (217/217), done.\u001b[K\n", - "remote: Total 315 (delta 42), reused 290 (delta 41), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (315/315), 3.86 MiB | 8.81 MiB/s, done.\n", - "Resolving deltas: 100% (42/42), done.\n", - "Cloning into 'OpenTrace'...\n", - "remote: Enumerating objects: 228, done.\u001b[K\n", - "remote: Counting objects: 100% (228/228), done.\u001b[K\n", - "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", - "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (228/228), 4.73 MiB | 11.14 MiB/s, done.\n", - "Resolving deltas: 100% (17/17), done.\n", - "/content/Trace-Bench\n", - "Get:1 https://cli.github.com/packages stable InRelease [3,917 B]\n", - "Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", - "Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", - "Get:4 https://cli.github.com/packages stable/main amd64 
Packages [356 B]\n", - "Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", - "Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", - "Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease\n", - "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,728 kB]\n", - "Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", - "Get:10 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", - "Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", - "Get:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", - "Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", - "Get:14 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", - "Get:15 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", - "Get:16 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", - "Get:17 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", - "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,895 kB]\n", - "Get:19 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", - "Get:20 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", - "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,040 kB]\n", - "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,678 kB]\n", - "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", - "Fetched 37.1 MB in 4s (9,437 kB/s)\n", - "Reading package lists... 
Done\n", - "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", - "Reading package lists... Done\n", - "Building dependency tree... Done\n", - "Reading state information... Done\n", - "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", - "0 upgraded, 0 newly installed, 0 to remove and 57 not upgraded.\n", - "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", - "Collecting pip\n", - " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", - "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: pip\n", - " Attempting uninstall: pip\n", - " Found existing installation: pip 24.1.2\n", - " Uninstalling pip-24.1.2:\n", - " Successfully uninstalled pip-24.1.2\n", - "Successfully installed pip-26.0.1\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", - "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", - "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", - "Collecting litellm==1.75.0\n", - " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", - "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", - "Requirement already satisfied: httpx>=0.23.0 
in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", - "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", - "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", - "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", - "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.17.0)\n", - "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", - "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", - "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", - "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", - "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", - "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in 
/usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", - "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", - "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", - "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", - "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", - "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", - "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", - "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in 
/usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", - "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", - "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", - "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", - "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", - "Requirement 
already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.3)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", - "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", - "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", - "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.4.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", - "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", - "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", - "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from 
huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", - "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: litellm\n", - "Successfully installed litellm-1.75.0\n" - ] - } - ], - "source": [ - "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", - "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", - "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", - "\n", - "%cd Trace-Bench\n", - "\n", - "# System + Python deps\n", - "!apt-get update -y && apt-get install -y graphviz\n", - "!python -m pip install -U pip\n", - "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" - ], - "id": "swOi3Bhtm0UQ" - }, + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml < np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.7469557727446884\n", - "[Step 1] \u001b[94mAlgo/Average train score: -499999.6367605793\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.7469557727446884\n", - "[Step 1] Update/best_candidate_mean_score: 0.7469557727446884\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.7264788414430507\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.7264788414430507\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: 0.7264788414430507\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:4: import numpy as np\n", - "import math\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - "\n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " np.random.seed(2025) # Set random seed for reproducibility\n", - " circles = []\n", - " total_radius = 0.0\n", - "\n", - " for _ in range(n):\n", - " radius = np.random.uniform(0.01, 0.05) # Dynamically assign radius\n", - " while True:\n", - " x = np.random.uniform(radius, 1 - 
radius)\n", - " y = np.random.uniform(radius, 1 - radius)\n", - " overlap = False\n", - " for circle in circles:\n", - " if math.hypot(circle[0] - x, circle[1] - y) < (circle[2] + radius):\n", - " overlap = True\n", - " break\n", - " if not overlap:\n", - " circles.append([x, y, radius])\n", - " total_radius += radius\n", - " break\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 0.5\u001b[0m\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "usage: trace-bench [-h] {list-tasks,list-trainers,validate,run,ui} ...\n", - "trace-bench: error: unrecognized arguments: --runs-dir /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml <\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260211-055528-de435ae56f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.08.531946{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260211-055528-de435ae5c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.03.063799{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260211-055528-de435ae5778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.03.603461{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260211-055528-de435ae54b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.04.485549{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260211-055528-de435ae50bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.04.149766{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df", - "summary": "{\n \"name\": \"df\",\n \"rows\": 14,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260211-055528-de435ae5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"4715e211f8a9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"trace_examples\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 421636.810540172,\n \"min\": -1000000.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": {\n 
\"dtype\": \"number\",\n \"std\": 0.47782940218389114,\n \"min\": -0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47782940218389114,\n \"min\": -0.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.637736580815169,\n \"min\": 3.3e-05,\n \"max\": 21.165263,\n \"num_unique_values\": 13,\n \"samples\": [\n 7.290935\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the target code exactly.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"jobs/4715e211f8a9/tb\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n 
}\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 4 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + " run_id job_id task_id suite \\\n", + "0 20260209-153346-0daa4bb9 6f3619dd9ae0 internal:code_param internal \n", + "1 20260209-153346-0daa4bb9 c486ba93400f internal:code_param internal \n", + "2 20260209-153346-0daa4bb9 778da61d2682 internal:numeric_param internal \n", + "3 20260209-153346-0daa4bb9 4b3a7f322126 internal:numeric_param internal \n", + "4 20260209-153346-0daa4bb9 0bfef35f6ef3 internal:multi_param internal \n", + "\n", + " trainer_id seed status score_initial score_final score_best \\\n", + "0 PrioritySearch 123 ok 1.0 1.0 1.0 \n", + "1 GEPA-Base 123 ok 1.0 1.0 1.0 \n", + "2 PrioritySearch 123 ok -3.0 -0.0 -0.0 \n", + "3 GEPA-Base 123 ok -3.0 -0.0 -0.0 \n", + "4 PrioritySearch 123 ok -1.0 -0.0 -0.0 \n", + "\n", + " time_seconds resolved_trainer_kwargs \\\n", + "0 10.507114 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "1 1.279633 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", + "2 4.215786 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "3 3.031100 {\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub... \n", + "4 3.620341 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "\n", + " resolved_optimizer_kwargs eval_kwargs \\\n", + "0 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "1 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "2 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "3 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "4 {\"memory_size\": 5, \"objective\": \"Make a+b matc... 
{\"timeout_seconds\": 10} \n", + "\n", + " feedback tb_logdir \n", + "0 Correct jobs/6f3619dd9ae0/tb \n", + "1 Correct jobs/c486ba93400f/tb \n", + "2 target=3.0 jobs/778da61d2682/tb \n", + "3 target=3.0 jobs/4b3a7f322126/tb \n", + "4 target=3.0 jobs/0bfef35f6ef3/tb " ], - "source": [ - "# Inspect latest run artifacts\n", - "import pathlib, json, pandas as pd\n", - "\n", - "runs_root = pathlib.Path(RUNS_DIR)\n", - "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", - "\n", - "run_dir = None\n", - "for p in reversed(candidates):\n", - " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", - " run_dir = p\n", - " break\n", - "\n", - "if run_dir is None:\n", - " for p in reversed(candidates):\n", - " if (p / \"config.snapshot.yaml\").exists():\n", - " run_dir = p\n", - " break\n", - "\n", - "if run_dir is None:\n", - " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", - "\n", - "print(\"Run dir:\", run_dir)\n", - "\n", - "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", - "env_path = run_dir / \"meta\" / \"env.json\"\n", - "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", - "\n", - "if not config_path.exists():\n", - " config_path = run_dir / \"config.snapshot.yaml\"\n", - " env_path = run_dir / \"env.json\"\n", - "\n", - "config_text = config_path.read_text()\n", - "print(config_text[:400])\n", - "\n", - "if manifest_path.exists():\n", - " manifest = json.loads(manifest_path.read_text())\n", - " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", - "\n", - "df = pd.read_csv(run_dir / \"results.csv\")\n", - "df.head()\n" + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260209-153346-0daa4bb96f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.010.507114{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260209-153346-0daa4bb9c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.01.279633{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260209-153346-0daa4bb9778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.04.215786{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260209-153346-0daa4bb94b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.03.031100{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260209-153346-0daa4bb90bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.03.620341{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" ], - "id": "ckY1HmQam0UU" - }, + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 12,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260209-153346-0daa4bb9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"364d89b28934\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 462909.5869786947,\n \"min\": -1000000.0,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n 
\"properties\": {\n \"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 353553.5610863874,\n \"min\": -1000000.0,\n \"max\": 1.375582371483138,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.86582048810776,\n \"min\": 3.5e-05,\n \"max\": 28.849823,\n \"num_unique_values\": 12,\n \"samples\": [\n 4.2e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the numeric target value.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"jobs/364d89b28934/tb\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gpkb4-1Em0UW" + }, + "source": [ + "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", + "\n", + "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." 
+ ], + "id": "gpkb4-1Em0UW" + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dMn7PDVgm0UX", + "outputId": "c37fef05-49b8-4180-dbc9-4b32fd20d45c", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "gpkb4-1Em0UW" - }, - "source": [ - "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", - "\n", - "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." - ], - "id": "gpkb4-1Em0UW" + "output_type": "stream", + "name": "stdout", + "text": [ + "=== 2x2 Matrix Smoke (mode=real) ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 0.6499617928349034\n", + "[Step 1] \u001b[94mAlgo/Average train score: -749999.8375095518\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.6499617928349034\n", + "[Step 1] Update/best_candidate_mean_score: 0.6499617928349034\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: -499999.67501910357\n", + "[Step 1] Update/exploration_candidates_mean_score: -499999.67501910357\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: -499999.67501910357\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + "\n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + "\n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " np.random.seed(2025)\n", + " \n", + " circles = []\n", + " for _ in range(n):\n", + " radius = np.random.rand() * 0.05 # Variable radius, capped to keep circles small\n", + " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Ensures circles fit in unit square\n", + "\n", + " # Check 
for overlapping\n", + " while any(np.linalg.norm([x - circle[0], y - circle[1]]) < (radius + circle[2]) for circle in circles):\n", + " x, y = np.random.rand(2) * (1 - 2 * radius) + radius # Reposition if overlap detected\n", + "\n", + " circles.append([x, y, radius])\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.4689943904012859\u001b[0m\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "dMn7PDVgm0UX", - "outputId": "a437b815-12a5-4096-f8e6-34157d8c15b5", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "=== 2x2 Matrix Smoke (mode=real) ===\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with only long-term memory.\n", - "Epoch: 0. Iteration: 0\n", - "[Step 0] Test/test_score: -3.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -3.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", - " return value\u001b[0m\n", - "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 0.0\n", - "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 3\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.0\n", - "[Step 1] Update/best_candidate_mean_score: 0.0\n", - "[Step 1] Update/best_candidate_num_rollouts: 2\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", - "[Step 1] Sample/mean_score: 0.0\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", - " return value\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", - "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", - "PrioritySearch initialized with only long-term memory.\n", - "Epoch: 0. 
Iteration: 0\n", - "[Step 0] Test/test_score: -1000000.0\n", - "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", - "[Step 0] Update/n_iters: 0\n", - "[Step 0] Update/short_term_memory_size: 0\n", - "[Step 0] Update/long_term_memory_size: 2\n", - "[Step 0] Update/using_short_term_memory: False\n", - "[Step 0] Update/using_long_term_memory: True\n", - "[Step 0] Update/total_samples: 0\n", - "[Step 0] Update/best_candidate_priority: inf\n", - "[Step 0] Update/best_candidate_num_rollouts: 0\n", - "[Step 0] Update/num_exploration_candidates: 2\n", - "[Step 0] Update/exploration_candidates_mean_priority: inf\n", - "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", - "[Step 0] Sample/mean_score: -1000000.0\n", - "[Step 0] Sample/num_samples: 2\n", - "[Step 0] Sample/self.n_epochs: 0\n", - "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", - "import math\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: 1.3000000000000003\n", - "[Step 1] \u001b[94mAlgo/Average train score: -499999.545\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 1.3000000000000003\n", - "[Step 1] Update/best_candidate_mean_score: 1.3000000000000003\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.9100000000000001\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.9100000000000001\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: 0.9100000000000001\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", - "import random\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - " random.seed(2025)\n", - " np.random.seed(2025)\n", - " \n", - " circles = []\n", - " radius = 0.05 # Starting radius\n", - "\n", - " for _ in range(n):\n", - " while True:\n", - " x = np.random.uniform(radius, 1 - radius)\n", - " y = np.random.uniform(radius, 1 - radius)\n", - " overlap = 
False\n", - " \n", - " for cx, cy, r in circles:\n", - " distance = np.sqrt((cx - x) ** 2 + (cy - y) ** 2)\n", - " if distance < (r + radius):\n", - " overlap = True\n", - " break\n", - " \n", - " if not overlap:\n", - " circles.append([x, y, radius])\n", - " break\n", - " \n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml < /content/m1_matrix.yaml <\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.0
1internal:numeric_paraminternalGEPA-Base123ok-0.0
2llm4ad:circle_packingllm4adPrioritySearch123ok1.3
3llm4ad:circle_packingllm4adGEPA-Base123ok-1000000.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 500000.2166670422,\n \"min\": -1000000.0,\n \"max\": 1.3000000000000005,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 6 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + " task_id suite trainer_id seed status score_best\n", + "0 internal:numeric_param internal PrioritySearch 123 ok -0.000000\n", + "1 internal:numeric_param internal GEPA-Base 123 ok -0.000000\n", + "2 
llm4ad:circle_packing llm4ad PrioritySearch 123 ok 0.649962\n", + "3 llm4ad:circle_packing llm4ad GEPA-Base 123 ok 1.468994" ], - "source": [ - "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", - "import json, pathlib, pandas as pd\n", - "\n", - "runs_root = pathlib.Path(RUNS_DIR)\n", - "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", - "\n", - "matrix_dir = None\n", - "for p in reversed(candidates):\n", - " summary_path = p / \"summary.json\"\n", - " if not summary_path.exists():\n", - " continue\n", - " try:\n", - " summary = json.loads(summary_path.read_text())\n", - " except Exception:\n", - " continue\n", - " if summary.get(\"total_jobs\") == 4:\n", - " matrix_dir = p\n", - " break\n", - "\n", - "if matrix_dir is None:\n", - " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", - "\n", - "print(\"Matrix run dir:\", matrix_dir)\n", - "\n", - "df = pd.read_csv(matrix_dir / \"results.csv\")\n", - "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", - "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", - "\n", - "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", - "print(f\"summary.json: {summary}\")\n", - "assert summary.get(\"total_jobs\") == 4\n", - "\n", - "print(\"\\n--- Matrix results ---\")\n", - "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.000000
1internal:numeric_paraminternalGEPA-Base123ok-0.000000
2llm4ad:circle_packingllm4adPrioritySearch123ok0.649962
3llm4ad:circle_packingllm4adGEPA-Base123ok1.468994
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" ], - "id": "W18tGXfYm0UZ" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10" - }, - "colab": { - "provenance": [] + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.697113339555075,\n \"min\": -0.0,\n \"max\": 1.468994390401286,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 } + ], + "source": [ + "# Verify 2x2 matrix: exactly 
4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + ], + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 6c588da283e93777b84f05f81c81c8a5fad2c964 Mon Sep 17 00:00:00 2001 From: Asad Date: Wed, 11 Feb 2026 17:46:57 +0500 Subject: [PATCH 6/8] FIX M1-critical items --- .gitignore | 2 + notebooks/01_m1_minimal_api.ipynb | 484 ++++++++++++------------ tests/m1/test_artifact_serialization.py | 57 +++ tests/m1/test_manifest_truth.py | 42 ++ 
tests/m1/test_validate_runs_dir.py | 37 ++ trace_bench/artifacts.py | 52 ++- trace_bench/cli.py | 37 +- trace_bench/results.py | 7 +- trace_bench/runner.py | 63 ++- 9 files changed, 514 insertions(+), 267 deletions(-) create mode 100644 tests/m1/test_artifact_serialization.py create mode 100644 tests/m1/test_manifest_truth.py create mode 100644 tests/m1/test_validate_runs_dir.py diff --git a/.gitignore b/.gitignore index 9fdd1f6..4ef3b31 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ runs/ runs_test/ notebooks/01_smoke_runner_with_output.ipynb notebooks/01_m1_minimal_api_with_output.ipynb +/.tmp_runs_run +/.tmp_runs_validate diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb index 4d8670c..0652c14 100644 --- a/notebooks/01_m1_minimal_api.ipynb +++ b/notebooks/01_m1_minimal_api.ipynb @@ -6,7 +6,7 @@ "id": "euYNX4m-m0Ty" }, "source": [ - "# Trace-Bench M1 \u2014 Minimal API Validation\n", + "# Trace-Bench M1 — Minimal API Validation\n", "\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", "\n", @@ -38,7 +38,7 @@ "execution_count": 1, "metadata": { "id": "8D3DGyVXm0UJ", - "outputId": "aadad0ba-037c-4ffc-8d5a-4c55fb9d0d3f", + "outputId": "7d4561ca-a602-4d08-dc1a-8fc7f0ffd9bd", "colab": { "base_uri": "https://localhost:8080/" } @@ -49,8 +49,8 @@ "name": "stdout", "text": [ "Mounted at /content/drive\n", - "Runs dir: /content/drive/MyDrive/bench/2026-02-09/trace_bench\n", - "API key found \u2014 running in REAL mode (model: gpt-4o-mini)\n", + "Runs dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", + "API key found — running in REAL mode (model: gpt-4o-mini)\n", "\n", "Mode: real\n" ] @@ -89,19 +89,26 @@ " except Exception:\n", " pass\n", "\n", + "MODEL = os.environ.get(\"OPENROUTER_MODEL\", \"openrouter/openai/gpt-4o-mini\")\n", + "\n", "if API_KEY:\n", " 
os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", + " # Compatibility for OpenAI-style clients used internally by optimizers.\n", + " os.environ[\"OPENAI_API_KEY\"] = API_KEY\n", + " os.environ[\"OPENAI_API_BASE\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n", " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", - " os.environ[\"TRACE_LITELLM_MODEL\"] = \"openrouter/openai/gpt-4o-mini\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", " MODE = \"real\"\n", - " print(f\"API key found \u2014 running in REAL mode (model: gpt-4o-mini)\")\n", + " print(f\"API key found ? running in REAL mode (model: {MODEL})\")\n", "else:\n", " MODE = \"stub\"\n", " print(\"WARNING: No OPENROUTER_API_KEY found. Falling back to STUB mode.\")\n", - " print(\" All outputs below are labeled STUB \u2014 not real LLM results.\")\n", + " print(\" All outputs below are labeled STUB ? not real LLM results.\")\n", "\n", "os.environ[\"TB_MODE\"] = MODE\n", - "print(f\"\\nMode: {MODE}\")" + "print(f\"\n", + "Mode: {MODE}\")\n" ], "id": "8D3DGyVXm0UJ" }, @@ -110,7 +117,7 @@ "execution_count": 2, "metadata": { "id": "swOi3Bhtm0UQ", - "outputId": "e9806308-35f8-48c5-e6b2-e5f46530a497", + "outputId": "7f54c901-77a3-41fd-d41f-ba7487bd6dd4", "colab": { "base_uri": "https://localhost:8080/" } @@ -123,54 +130,54 @@ "Cloning into 'Trace-Bench'...\n", "remote: Enumerating objects: 315, done.\u001b[K\n", "remote: Counting objects: 100% (315/315), done.\u001b[K\n", - "remote: Compressing objects: 100% (222/222), done.\u001b[K\n", - "remote: Total 315 (delta 42), reused 274 (delta 36), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (315/315), 3.86 MiB | 8.12 MiB/s, done.\n", + "remote: Compressing objects: 100% (217/217), done.\u001b[K\n", + "remote: Total 315 (delta 42), reused 290 (delta 41), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (315/315), 3.86 MiB | 15.95 MiB/s, done.\n", "Resolving 
deltas: 100% (42/42), done.\n", "Cloning into 'OpenTrace'...\n", "remote: Enumerating objects: 228, done.\u001b[K\n", "remote: Counting objects: 100% (228/228), done.\u001b[K\n", "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", - "remote: Total 228 (delta 17), reused 114 (delta 13), pack-reused 0 (from 0)\u001b[K\n", - "Receiving objects: 100% (228/228), 4.73 MiB | 14.77 MiB/s, done.\n", + "remote: Total 228 (delta 17), reused 115 (delta 13), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (228/228), 4.73 MiB | 28.34 MiB/s, done.\n", "Resolving deltas: 100% (17/17), done.\n", "/content/Trace-Bench\n", - "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", "Get:2 https://cli.github.com/packages stable InRelease [3,917 B]\n", - "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", "Get:4 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", "Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease\n", - "Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", "Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", - "Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", - "Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,893 kB]\n", + "Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", - "Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", - "Get:12 
http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", - "Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", - "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", - "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,661 kB]\n", - "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", - "Get:17 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,716 kB]\n", - "Get:18 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", - "Get:19 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", - "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", + "Get:11 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", + "Get:12 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", + "Get:13 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", + "Get:14 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:16 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,895 kB]\n", + "Get:17 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", + "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,728 kB]\n", + "Get:19 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,678 kB]\n", + "Get:20 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", "Get:21 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", - "Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,035 kB]\n", + 
"Get:22 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,040 kB]\n", "Get:23 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", - "Fetched 37.1 MB in 6s (6,435 kB/s)\n", + "Fetched 37.1 MB in 4s (9,192 kB/s)\n", "Reading package lists... Done\n", "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", "Reading package lists... Done\n", "Building dependency tree... Done\n", "Reading state information... Done\n", "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", - "0 upgraded, 0 newly installed, 0 to remove and 55 not upgraded.\n", + "0 upgraded, 0 newly installed, 0 to remove and 57 not upgraded.\n", "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", "Collecting pip\n", " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m71.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: pip\n", " Attempting uninstall: pip\n", " Found existing installation: pip 24.1.2\n", @@ -190,7 +197,7 @@ "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", 
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", - "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.16.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.17.0)\n", "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", @@ -230,20 +237,20 @@ "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.2)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in 
/usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", - "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.3.7)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.4.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", - "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m103.7 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", 
"\u001b[?25hInstalling collected packages: litellm\n", "Successfully installed litellm-1.75.0\n" ] @@ -251,7 +258,7 @@ ], "source": [ "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", - "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch m1/deliverable https://github.com/guru-code-expert/Trace-Bench.git\n", "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", "\n", "%cd Trace-Bench\n", @@ -259,7 +266,7 @@ "# System + Python deps\n", "!apt-get update -y && apt-get install -y graphviz\n", "!python -m pip install -U pip\n", - "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0\n" ], "id": "swOi3Bhtm0UQ" }, @@ -268,7 +275,7 @@ "execution_count": 3, "metadata": { "id": "a__iRJTHm0UR", - "outputId": "f48aba86-b779-4537-f5ce-8d5b2bdc4154", + "outputId": "13119120-f658-48a6-f4b2-ea3bcbc16476", "colab": { "base_uri": "https://localhost:8080/" } @@ -285,48 +292,6 @@ "GEPA-Beam\tavailable\n", "\n", "=== Validate config (strict) ===\n", - "[OK] internal:code_param\n", - "[OK] internal:numeric_param\n", - "[OK] internal:multi_param\n", - "[OK] internal:non_trainable\n", - "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", - "[OK] trace_examples:greeting_stub\n", - "[OK] llm4ad:circle_packing\n", - "[SKIP] veribench:smoke_placeholder: VeriBench tasks not yet wired: awaiting Trace team entrypoint/task list.\n", - "\n", - "[OK] matrix: 28 jobs expanded deterministically\n", - " job 6f3619dd9ae0: internal:code_param x PrioritySearch (seed=123)\n", - " job c486ba93400f: internal:code_param x GEPA-Base (seed=123)\n", - " job a84d2486d31a: internal:code_param x GEPA-UCB (seed=123)\n", - " job 8ecff95cfafa: internal:code_param x GEPA-Beam (seed=123)\n", - " job 778da61d2682: internal:numeric_param x PrioritySearch 
(seed=123)\n", - " job 4b3a7f322126: internal:numeric_param x GEPA-Base (seed=123)\n", - " job 4b9c7d66d866: internal:numeric_param x GEPA-UCB (seed=123)\n", - " job 54df742bb5e9: internal:numeric_param x GEPA-Beam (seed=123)\n", - " job 0bfef35f6ef3: internal:multi_param x PrioritySearch (seed=123)\n", - " job e06adbe6489b: internal:multi_param x GEPA-Base (seed=123)\n", - " job 8669d9b963d4: internal:multi_param x GEPA-UCB (seed=123)\n", - " job 90d23f88baf7: internal:multi_param x GEPA-Beam (seed=123)\n", - " job d6aa82e5d119: internal:non_trainable x PrioritySearch (seed=123)\n", - " job 4f655637a6dc: internal:non_trainable x GEPA-Base (seed=123)\n", - " job 85940a1b71e7: internal:non_trainable x GEPA-UCB (seed=123)\n", - " job dafcec9c13af: internal:non_trainable x GEPA-Beam (seed=123)\n", - " job e8e9938a4ef6: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", - " job 4715e211f8a9: trace_examples:greeting_stub x GEPA-Base (seed=123)\n", - " job 8c4ec9f3e355: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", - " job 2f84751a35ad: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", - " job da0e8ae694f1: llm4ad:circle_packing x PrioritySearch (seed=123)\n", - " job 0865599891de: llm4ad:circle_packing x GEPA-Base (seed=123)\n", - " job d25dcdb59892: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", - " job d985faad90f4: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", - " job 364d89b28934: veribench:smoke_placeholder x PrioritySearch (seed=123)\n", - " job 721282ed015b: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", - " job 5b657b995d7a: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", - " job 77b3e4cb5bf0: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", - "\n", - " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", - " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 
'PrioritySearch']\n", - "[OK] manifest written: runs/20260209-153344-8f7a72b4/meta/manifest.json\n", "\n", "=== Generate M1 run config (mode=real) ===\n", "Config mode: real\n", @@ -455,7 +420,7 @@ "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", "[Step 1] Update/n_iters: 1\n", "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/long_term_memory_size: 4\n", "[Step 1] Update/using_short_term_memory: False\n", "[Step 1] Update/using_long_term_memory: True\n", "[Step 1] Update/total_samples: 6\n", @@ -465,19 +430,68 @@ "[Step 1] Update/num_exploration_candidates: 2\n", "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.5\n", "[Step 1] Sample/mean_score: 0.0\n", "[Step 1] Sample/num_samples: 2\n", "[Step 1] Sample/self.n_epochs: 1\n", "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:2: 1.5\u001b[0m\n", - "[Step 1] \u001b[91mParameter/float:3: 1.5\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 2.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:3: 1.0\u001b[0m\n", "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:22: Hello\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:22: Hello\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", "[Step 0] Test/test_score: -1000000.0\n", "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", "[Step 0] Update/n_iters: 0\n", @@ -495,7 +509,7 @@ "[Step 0] Sample/num_samples: 2\n", "[Step 0] Sample/self.n_epochs: 0\n", "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", - "[Step 0] \u001b[91mParameter/__code:3: import numpy as np\n", + "[Step 0] \u001b[91mParameter/__code:4: import numpy as np\n", "import math\n", "def pack_circles(n: int) -> np.ndarray:\n", " \"\"\"\n", @@ -525,27 +539,27 @@ "\n", " return np.array(circles)\u001b[0m\n", "Epoch: 0. Iteration: 1\n", - "[Step 1] Test/test_score: 1.375582371483138\n", - "[Step 1] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 1] Test/test_score: -1000000.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.48\u001b[0m\n", "[Step 1] Update/n_iters: 1\n", "[Step 1] Update/short_term_memory_size: 0\n", "[Step 1] Update/long_term_memory_size: 5\n", "[Step 1] Update/using_short_term_memory: False\n", "[Step 1] Update/using_long_term_memory: True\n", "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 1.375582371483138\n", - "[Step 1] Update/best_candidate_mean_score: 1.375582371483138\n", + "[Step 1] Update/best_candidate_priority: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_mean_score: 1.3000000000000003\n", "[Step 1] Update/best_candidate_num_rollouts: 1\n", "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 1.0407921408122753\n", - "[Step 1] Update/exploration_candidates_mean_score: 1.0407921408122753\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.04\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.04\n", "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: -1000000.0\n", + "[Step 1] Sample/mean_score: 1.04\n", "[Step 1] Sample/num_samples: 2\n", "[Step 1] 
Sample/self.n_epochs: 1\n", "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:3: import numpy as np\n", - "import random\n", + "[Step 1] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", "\n", "def pack_circles(n: int) -> np.ndarray:\n", " \"\"\"\n", @@ -561,80 +575,80 @@ " \n", " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", " \"\"\"\n", - "\n", - " random.seed(2025)\n", " np.random.seed(2025)\n", - "\n", " circles = []\n", - " radii = np.random.uniform(0.01, 0.1, size=n) # Random radii between 0.01 and 0.1\n", + " radius = 0.05 # Starting with a smaller radius for each circle.\n", "\n", " for _ in range(n):\n", - " placed = False\n", - " while not placed:\n", - " radius = np.random.choice(radii)\n", + " while True:\n", " x = np.random.uniform(radius, 1 - radius)\n", " y = np.random.uniform(radius, 1 - radius)\n", - " overlap = False\n", - " \n", " # Check for overlap\n", - " for circle in circles:\n", - " if np.sqrt((circle[0] - x) ** 2 + (circle[1] - y) ** 2) < (circle[2] + radius):\n", - " overlap = True\n", - " break\n", - " \n", - " if not overlap:\n", + " if all(math.sqrt((x - cx) ** 2 + (y - cy) ** 2) >= 2 * radius for cx, cy, _ in circles):\n", " circles.append([x, y, radius])\n", - " placed = True\n", + " break\n", "\n", " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: -1000000.0\u001b[0m\n" + "[Step 1] \u001b[92mGEPA(base) best mean: 1.063446105401886\u001b[0m\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00\n", + "
\n", "
\n", "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260211-104930-de435ae56f3619dd9ae0internal:code_paraminternalPrioritySearch123ok1.01.01.07.705247{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/6f3619dd9ae0/tb
120260211-104930-de435ae5c486ba93400finternal:code_paraminternalGEPA-Base123ok1.01.01.00.625392{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/c486ba93400f/tb
220260211-104930-de435ae5778da61d2682internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.010.472214{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/778da61d2682/tb
320260211-104930-de435ae54b3a7f322126internal:numeric_paraminternalGEPA-Base123ok-3.0-0.0-0.03.767528{\"merge_every\": 2, \"num_iters\": 1, \"pareto_sub...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/4b3a7f322126/tb
420260211-104930-de435ae50bfef35f6ef3internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.04.724452{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/0bfef35f6ef3/tb
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/Trace-Bench\n", + "\n", + "echo \"=== List trainers ===\"\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench list-trainers\n", + "\n", + "echo \"\"\n", + "echo \"=== Validate config (strict) ===\"\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench validate --config configs/m1_validation.yaml --strict --runs-dir \"$RUNS_DIR\"\n", + "\n", + "echo \"\"\n", + "echo \"=== Generate M1 run config (mode=$TB_MODE) ===\"\n", + "cat > /content/m1_run.yaml < np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - "\n", - " grid_size = int(np.ceil(np.sqrt(n)))\n", - " radius = 0.5 / grid_size\n", - "\n", - " circles = []\n", - " for i in range(n):\n", - " row = i // grid_size\n", - " col = i % grid_size\n", - " x = (col + 0.5) / grid_size\n", - " y = (row + 0.5) / grid_size\n", - " circles.append([x, y, radius])\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "Epoch: 0. 
Iteration: 1\n", - "[Step 1] Test/test_score: -1000000.0\n", - "[Step 1] \u001b[94mAlgo/Average train score: -749999.875\u001b[0m\n", - "[Step 1] Update/n_iters: 1\n", - "[Step 1] Update/short_term_memory_size: 0\n", - "[Step 1] Update/long_term_memory_size: 5\n", - "[Step 1] Update/using_short_term_memory: False\n", - "[Step 1] Update/using_long_term_memory: True\n", - "[Step 1] Update/total_samples: 6\n", - "[Step 1] Update/best_candidate_priority: 0.789046857069868\n", - "[Step 1] Update/best_candidate_mean_score: 0.789046857069868\n", - "[Step 1] Update/best_candidate_num_rollouts: 1\n", - "[Step 1] Update/num_exploration_candidates: 2\n", - "[Step 1] Update/exploration_candidates_mean_priority: 0.6445234285349339\n", - "[Step 1] Update/exploration_candidates_mean_score: 0.6445234285349339\n", - "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", - "[Step 1] Sample/mean_score: -499999.75\n", - "[Step 1] Sample/num_samples: 2\n", - "[Step 1] Sample/self.n_epochs: 1\n", - "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", - "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", - "import random\n", - "\n", - "def pack_circles(n: int) -> np.ndarray:\n", - " \"\"\"\n", - " Pack n circles in a unit square to maximize sum of radii.\n", - " \n", - " Args:\n", - " n: Number of circles to pack\n", - "\n", - " Returns:\n", - " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", - " All values should be between 0 and 1\n", - " Circles must not overlap\n", - " \n", - " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", - " \"\"\"\n", - " np.random.seed(2025)\n", - " random.seed(2025)\n", - "\n", - " circles = []\n", - " attempts = 0\n", - " max_attempts = 10000\n", - " \n", - " while len(circles) < n and attempts < max_attempts:\n", - " radius = random.uniform(0.01, 0.05)\n", - " x = random.uniform(radius, 1 - radius)\n", - " y = 
random.uniform(radius, 1 - radius)\n", - " \n", - " new_circle = (x, y, radius)\n", - " if all(np.linalg.norm(np.array(new_circle[:2]) - np.array(existing_circle[:2])) >= (new_circle[2] + existing_circle[2]) for existing_circle in circles):\n", - " circles.append(new_circle)\n", - " \n", - " attempts += 1\n", - "\n", - " return np.array(circles)\u001b[0m\n", - "[Step 1] \u001b[92mGEPA(base) best mean: 0.8402514352519977\u001b[0m\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "ckY1HmQam0UU", + "outputId": "6bb25555-b88d-456d-e1ba-02276fdad7af", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 764 + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Run dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench/20260211-131540-093c5358\n", + "run_id: 20260211-131540-093c5358\n", + "runs_dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", + "mode: real\n", + "seeds:\n", + "- 123\n", + "max_workers: 1\n", + "fail_fast: false\n", + "tasks:\n", + "- id: internal:code_param\n", + " eval_kwargs:\n", + " timeout_seconds: 10\n", + "- id: internal:numeric_param\n", + " eval_kwargs:\n", + " timeout_seconds: 10\n", + "- id: internal:multi_param\n", + " eval_kwargs:\n", + " timeout_seconds: 10\n", + "- id: internal:non_trainable\n", + " eval_kwargs:\n", + "Jobs in manifest: 14\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " run_id job_id task_id suite \\\n", + "0 20260211-131540-093c5358 741cb015f747 internal:code_param internal \n", + "1 20260211-131540-093c5358 deec0f7230de internal:code_param internal \n", + "2 20260211-131540-093c5358 84b26f14a134 internal:numeric_param internal \n", + "3 20260211-131540-093c5358 2cdd86425cca internal:numeric_param internal \n", + "4 20260211-131540-093c5358 9531e7285512 internal:multi_param internal \n", + "\n", + " trainer_id seed status score_initial score_final score_best \\\n", + "0 PrioritySearch 123 ok 1.0 
1.0 1.0 \n", + "1 GEPA-Base 123 ok 1.0 1.0 1.0 \n", + "2 PrioritySearch 123 ok -3.0 -0.0 -0.0 \n", + "3 GEPA-Base 123 ok -0.0 -0.0 -0.0 \n", + "4 PrioritySearch 123 ok -1.0 -0.0 -0.0 \n", + "\n", + " time_seconds resolved_trainer_kwargs \\\n", + "0 4.113878 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "1 2.601084 {\"merge_every\": 2, \"num_iters\": 1, \"num_thread... \n", + "2 6.988559 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "3 4.882765 {\"merge_every\": 2, \"num_iters\": 1, \"num_thread... \n", + "4 9.249504 {\"memory_update_frequency\": 1, \"num_batches\": ... \n", + "\n", + " resolved_optimizer_kwargs eval_kwargs \\\n", + "0 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "1 {\"memory_size\": 5, \"objective\": \"Match the tar... {\"timeout_seconds\": 10} \n", + "2 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "3 {\"memory_size\": 5, \"objective\": \"Match the num... {\"timeout_seconds\": 10} \n", + "4 {\"memory_size\": 5, \"objective\": \"Make a+b matc... {\"timeout_seconds\": 10} \n", + "\n", + " feedback tb_logdir \n", + "0 Correct jobs/741cb015f747/tb \n", + "1 Correct jobs/deec0f7230de/tb \n", + "2 target=3.0 jobs/84b26f14a134/tb \n", + "3 target=3.0 jobs/2cdd86425cca/tb \n", + "4 target=3.0 jobs/9531e7285512/tb " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_idjob_idtask_idsuitetrainer_idseedstatusscore_initialscore_finalscore_besttime_secondsresolved_trainer_kwargsresolved_optimizer_kwargseval_kwargsfeedbacktb_logdir
020260211-131540-093c5358741cb015f747internal:code_paraminternalPrioritySearch123ok1.01.01.04.113878{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/741cb015f747/tb
120260211-131540-093c5358deec0f7230deinternal:code_paraminternalGEPA-Base123ok1.01.01.02.601084{\"merge_every\": 2, \"num_iters\": 1, \"num_thread...{\"memory_size\": 5, \"objective\": \"Match the tar...{\"timeout_seconds\": 10}Correctjobs/deec0f7230de/tb
220260211-131540-093c535884b26f14a134internal:numeric_paraminternalPrioritySearch123ok-3.0-0.0-0.06.988559{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/84b26f14a134/tb
320260211-131540-093c53582cdd86425ccainternal:numeric_paraminternalGEPA-Base123ok-0.0-0.0-0.04.882765{\"merge_every\": 2, \"num_iters\": 1, \"num_thread...{\"memory_size\": 5, \"objective\": \"Match the num...{\"timeout_seconds\": 10}target=3.0jobs/2cdd86425cca/tb
420260211-131540-093c53589531e7285512internal:multi_paraminternalPrioritySearch123ok-1.0-0.0-0.09.249504{\"memory_update_frequency\": 1, \"num_batches\": ...{\"memory_size\": 5, \"objective\": \"Make a+b matc...{\"timeout_seconds\": 10}target=3.0jobs/9531e7285512/tb
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 14,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260211-131540-093c5358\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"8538a43564b6\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"trace_examples\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 316227.80338516145,\n \"min\": -1000000.0,\n \"max\": 1.063446105401886,\n \"num_unique_values\": 6,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": 
{\n \"dtype\": \"number\",\n \"std\": 0.5618951165634496,\n \"min\": -0.0,\n \"max\": 1.3509495181645703,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5618951165634496,\n \"min\": -0.0,\n \"max\": 1.3509495181645703,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.042801912329264,\n \"min\": 0.000113,\n \"max\": 30.771424,\n \"num_unique_values\": 14,\n \"samples\": [\n 0.604331\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"num_threads\\\": 2, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the target code exactly.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"jobs/8538a43564b6/tb\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" }, { - "output_type": "stream", - "name": "stderr", - "text": [ - "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml < np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values 
should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 1.3000000000000003\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.4091384736\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_mean_score: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.181723052700943\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.181723052700943\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 1.181723052700943\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " 
Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " \n", + " np.random.seed(2025)\n", + " radius = 0.05 # Set a fixed radius for simplicity\n", + " circles = []\n", + " \n", + " for _ in range(n):\n", + " while True:\n", + " x = np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " # Check for overlap\n", + " overlap = False\n", + " for circle in circles:\n", + " if ((x - circle[0]) ** 2 + (y - circle[1]) ** 2) < (2 * radius) ** 2:\n", + " overlap = True\n", + " break\n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " break\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.3000000000000003\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml <\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.0
1internal:numeric_paraminternalGEPA-Base123ok-0.0
2llm4ad:circle_packingllm4adPrioritySearch123ok1.3
3llm4ad:circle_packingllm4adGEPA-Base123ok1.3
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7505553499465139,\n \"min\": -0.0,\n \"max\": 1.3000000000000005,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.3000000000000005\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.000000
1internal:numeric_paraminternalGEPA-Base123ok-0.000000
2llm4ad:circle_packingllm4adPrioritySearch123ok0.789047
3llm4ad:circle_packingllm4adGEPA-Base123ok0.840251
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" + "source": [ + "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.470802218117878,\n \"min\": -0.0,\n \"max\": 0.8402514352519977,\n \"num_unique_values\": 3,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 6 + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + }, + "colab": { + "provenance": [] } - ], - "source": [ - "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", - "import json, pathlib, pandas as pd\n", - "\n", - "runs_root = pathlib.Path(RUNS_DIR)\n", - "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", - "\n", - "matrix_dir = None\n", - "for p in reversed(candidates):\n", - " summary_path = p / \"summary.json\"\n", - " if not summary_path.exists():\n", - " continue\n", - " try:\n", - " summary = json.loads(summary_path.read_text())\n", - " except Exception:\n", - " continue\n", - " if summary.get(\"total_jobs\") == 4:\n", - " matrix_dir = p\n", - " break\n", - "\n", - "if matrix_dir is None:\n", - " raise 
FileNotFoundError(\"No matrix run with total_jobs==4 found. Re-run the matrix cell.\")\n", - "\n", - "print(\"Matrix run dir:\", matrix_dir)\n", - "\n", - "df = pd.read_csv(matrix_dir / \"results.csv\")\n", - "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", - "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", - "\n", - "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", - "print(f\"summary.json: {summary}\")\n", - "assert summary.get(\"total_jobs\") == 4\n", - "\n", - "print(\"\\n--- Matrix results ---\")\n", - "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" - ], - "id": "W18tGXfYm0UZ" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10" }, - "colab": { - "provenance": [] - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } \ No newline at end of file