diff --git a/.gitignore b/.gitignore
index 83e4e75..4ef3b31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,11 @@ __pycache__/
 external/*
 **/uv.lock
 *.egg-info/
-**/.venv/
\ No newline at end of file
+**/.venv/
+.env
+runs/
+runs_test/
+notebooks/01_smoke_runner_with_output.ipynb
+notebooks/01_m1_minimal_api_with_output.ipynb
+/.tmp_runs_run
+/.tmp_runs_validate
diff --git a/README.md b/README.md
index 3423365..bf586a2 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,70 @@
 Currently, we are adding problems/domains one folder at a time.
 The instructions to run each task are located inside the task folder.
 
+## Quick Start (Runner/CLI)
+
+```bash
+# M1 review checklist (recommended order)
+# 1) List tasks (LLM4AD + example stubs)
+trace-bench list-tasks --root LLM4AD/benchmark_tasks
+
+# 2) Validate a config
+trace-bench validate --config configs/smoke.yaml
+
+# 3) Run Stub smoke (deterministic, no keys)
+trace-bench run --config configs/smoke.yaml --runs-dir runs
+
+# 4) Run Real smoke (requires OPENAI_API_KEY)
+trace-bench run --config configs/smoke_real.yaml --runs-dir runs
+
+# 5) Run tests (disable external plugin autoload)
+PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q
+
+# 6) Launch UI (stub)
+trace-bench ui --runs-dir runs
+```
+
+Expected run artifacts:
+- `runs/<run_id>/config.snapshot.yaml`
+- `runs/<run_id>/env.json`
+- `runs/<run_id>/results.csv`
+- `runs/<run_id>/events.jsonl`
+- `runs/<run_id>/summary.json`
+- `runs/<run_id>/tb/`
+
+## M1 Dependencies (Required for Full Pass)
+
+System:
+- Graphviz (system package)
+
+Python:
+- `graphviz`, `pyyaml`, `pytest`, `numpy`, `matplotlib`, `litellm==1.75.0`
+
+OpenTrace examples strict smoke (for 100% pass):
+- `datasets`, `textgrad`, `dspy`, `autogen`, `python-dotenv`
+
+## OpenTrace Examples Smoke (100% Pass Mode)
+
+To enforce 100% example smoke in CI, run:
+```bash
+TRACE_BENCH_STRICT_EXAMPLES=1 PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest -q
+```
+Without strict mode, the smoke test skips only when optional deps are missing.
+
+## VeriBench Status (In Scope, Pending Input)
+
+VeriBench is in scope but requires the Trace team to provide the task entrypoint/task list.
+CLI flags are ready (`--bench veribench`); when the entrypoint is unavailable, tasks are skipped with a structured reason rather than raising.
+
 ## Problem Sets
 
 ### General Problem Sets
@@ -27,9 +91,9 @@
 Current implementation of graph is a single node.
 
 **Supported Algorithms:** PrioritySearch, GEPA-Base, GEPA-UCB, GEPA-Beam
 
-📖 **[See detailed usage guide →](LM4AD/readme.md)**
+**See detailed usage guide:** `LM4AD/readme.md`
 
 ## Agent Architecture - ReAct agent
 
-All the libraries from other repos are stored and managed in the `external` folder -- this folder will be created if one of the `install.sh` script is run inside the task folder.
+All the libraries from other repos are stored and managed in the `external` folder -- this folder will be created when one of the `install.sh` scripts is run inside the task folder.
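The expected run artifacts listed in the README above can be spot-checked with a few lines of Python after `trace-bench run` completes. This is a minimal sketch, assuming the `runs/<run_id>/` layout from the README; the `latest_run` and `check_artifacts` helpers are illustrative, not part of the `trace-bench` CLI, and the `meta/` fallback mirrors the newer `meta/` + `jobs/` layout exercised in the notebook below.

```python
# Sketch: verify the canonical artifacts of the most recent run folder.
# The helper names below are illustrative; only the artifact list comes from the README.
from pathlib import Path

EXPECTED = ["config.snapshot.yaml", "env.json", "results.csv",
            "events.jsonl", "summary.json", "tb"]


def latest_run(runs_dir: str = "runs") -> Path:
    """Return the most recently modified run folder under runs_dir."""
    run_dirs = [p for p in Path(runs_dir).iterdir() if p.is_dir()]
    if not run_dirs:
        raise FileNotFoundError(f"no run folders under {runs_dir}")
    return max(run_dirs, key=lambda p: p.stat().st_mtime)


def check_artifacts(run_dir: Path) -> list[str]:
    """List expected artifacts that are missing, accepting flat or meta/ layouts."""
    missing = []
    for name in EXPECTED:
        if not (run_dir / name).exists() and not (run_dir / "meta" / name).exists():
            missing.append(name)
    return missing


if __name__ == "__main__":
    run_dir = latest_run()
    missing = check_artifacts(run_dir)
    print(run_dir, "->", "all artifacts present" if not missing else f"missing: {missing}")
```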
diff --git a/configs/m1_matrix_smoke.yaml b/configs/m1_matrix_smoke.yaml new file mode 100644 index 0000000..3ba1b6e --- /dev/null +++ b/configs/m1_matrix_smoke.yaml @@ -0,0 +1,24 @@ +runs_dir: runs +mode: stub +seeds: [123] +max_workers: 1 +fail_fast: false + +tasks: + - id: internal:numeric_param + - id: llm4ad:circle_packing + eval_kwargs: + timeout_seconds: 10 + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 + + - id: GEPA-Base + params_variants: + - gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 diff --git a/configs/m1_validation.yaml b/configs/m1_validation.yaml new file mode 100644 index 0000000..fdbe511 --- /dev/null +++ b/configs/m1_validation.yaml @@ -0,0 +1,55 @@ +runs_dir: runs +mode: stub +seeds: [123] +max_workers: 1 +fail_fast: false + +tasks: + - id: internal:code_param + - id: internal:numeric_param + - id: internal:multi_param + - id: internal:non_trainable + - id: trace_examples:greeting_stub + - id: llm4ad:circle_packing + eval_kwargs: + timeout_seconds: 10 + - id: veribench:smoke_placeholder + +trainers: + - id: PrioritySearch + params_variants: + - threads: 2 + ps_steps: 1 + ps_batches: 1 + ps_candidates: 2 + ps_proposals: 2 + ps_mem_update: 1 + + - id: GEPA-Base + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + optimizer: OPROv2 + optimizer_kwargs: {} + + - id: GEPA-UCB + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + + - id: GEPA-Beam + params_variants: + - threads: 2 + gepa_iters: 1 + gepa_train_bs: 2 + gepa_merge_every: 2 + gepa_pareto_subset: 2 + +eval_kwargs: + timeout_seconds: 10 diff --git a/configs/smoke.yaml b/configs/smoke.yaml new file mode 100644 index 0000000..8455c9f --- /dev/null +++ b/configs/smoke.yaml @@ -0,0 +1,12 @@ +runs_dir: runs +mode: stub +seeds: [123] + +tasks: + - id: internal:numeric_param + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 diff --git a/configs/smoke_real.yaml b/configs/smoke_real.yaml new file mode 100644 index 0000000..2ebb27d --- /dev/null +++ b/configs/smoke_real.yaml @@ -0,0 +1,12 @@ +runs_dir: runs +mode: real +seeds: [123] + +tasks: + - id: trace_examples:greeting_stub + +trainers: + - id: PrioritySearch + params_variants: + - ps_steps: 1 + ps_batches: 1 diff --git a/notebooks/01_m1_minimal_api.ipynb b/notebooks/01_m1_minimal_api.ipynb new file mode 100644 index 0000000..410c05e --- /dev/null +++ b/notebooks/01_m1_minimal_api.ipynb @@ -0,0 +1,1634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "euYNX4m-m0Ty" + }, + "source": [ + "# Trace-Bench M1 β€” Minimal API Validation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guru-code-expert/Trace-Bench/blob/m1/deliverable/notebooks/01_m1_minimal_api.ipynb)\n", + "\n", + "This notebook validates the **M1 contracts**: canonical artifacts, deterministic IDs, and minimal runnable coverage across benches.\n", + "\n", + "**Mode policy**: defaults to **real** (uses API key if present). If no key is found, falls back to **stub** with a clear warning and STUB label on outputs." 
+ ], + "id": "euYNX4m-m0Ty" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5DVjcAAm0UH" + }, + "source": [ + "## Expected Outputs\n", + "\n", + "- A new `runs//` folder with `meta/` + `jobs/` layout.\n", + "- `meta/config.snapshot.yaml`, `meta/manifest.json`, `meta/env.json` exist.\n", + "- `results.csv` contains `status` values (`ok`/`failed`/`skipped`).\n", + "- Internal non-trainable job shows `status=failed` with reason.\n", + "- If running in **real** mode, artifacts show `mode=real` and LLM4AD task produces a score.\n", + "- **2x2 matrix smoke**: `results.csv` with exactly 4 rows from 2 tasks x 2 trainers x 1 seed." + ], + "id": "u5DVjcAAm0UH" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "8D3DGyVXm0UJ", + "outputId": "2b621443-f1f0-45c2-bbec-d8f0803ea933", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "Runs dir: /content/drive/MyDrive/bench/2026-02-11/trace_bench\n", + "API key found - running in REAL mode (model: openrouter/openai/gpt-4o-mini)\n", + "\n", + "Mode: real\n" + ] + } + ], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir + detect API key\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive_root = Path(\"/content/drive/MyDrive\")\n", + " root = drive_root if drive_root.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)\n", + "\n", + "# --- Auto-detect API key (real mode by default) ---\n", + "API_KEY = os.environ.get(\"OPENROUTER_API_KEY\", \"\")\n", + "if not API_KEY:\n", + " try:\n", + " from google.colab import userdata\n", + " API_KEY = userdata.get(\"OPENROUTER_API_KEY\") or \"\"\n", + " except Exception:\n", + " pass\n", + "\n", + "MODEL = os.environ.get(\"OPENROUTER_MODEL\", \"openrouter/openai/gpt-4o-mini\")\n", + "\n", + "if API_KEY:\n", + " os.environ[\"OPENROUTER_API_KEY\"] = API_KEY\n", + " # Compatibility for OpenAI-style clients used internally by optimizers.\n", + " os.environ[\"OPENAI_API_KEY\"] = API_KEY\n", + " os.environ[\"OPENAI_API_BASE\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"OPENAI_BASE_URL\"] = \"https://openrouter.ai/api/v1\"\n", + " os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + " os.environ[\"TRACE_LITELLM_MODEL\"] = MODEL\n", + " MODE = \"real\"\n", + " print(f\"API key found - running in REAL mode (model: {MODEL})\")\n", + "else:\n", + " MODE = \"stub\"\n", + " print(\"WARNING: No OPENROUTER_API_KEY found. 
Falling back to STUB mode.\")\n", + " print(\" All outputs below are labeled STUB - not real LLM results.\")\n", + "\n", + "os.environ[\"TB_MODE\"] = MODE\n", + "print(f\"\\nMode: {MODE}\")" + ], + "id": "8D3DGyVXm0UJ" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "swOi3Bhtm0UQ", + "outputId": "318e618c-53c0-407e-d757-0ade4d0b5ff1", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'Trace-Bench'...\n", + "remote: Enumerating objects: 317, done.\u001b[K\n", + "remote: Counting objects: 100% (317/317), done.\u001b[K\n", + "remote: Compressing objects: 100% (219/219), done.\u001b[K\n", + "remote: Total 317 (delta 42), reused 282 (delta 41), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (317/317), 3.85 MiB | 15.92 MiB/s, done.\n", + "Resolving deltas: 100% (42/42), done.\n", + "Cloning into 'OpenTrace'...\n", + "remote: Enumerating objects: 228, done.\u001b[K\n", + "remote: Counting objects: 100% (228/228), done.\u001b[K\n", + "remote: Compressing objects: 100% (205/205), done.\u001b[K\n", + "remote: Total 228 (delta 17), reused 115 (delta 13), pack-reused 0 (from 0)\u001b[K\n", + "Receiving objects: 100% (228/228), 4.73 MiB | 9.13 MiB/s, done.\n", + "Resolving deltas: 100% (17/17), done.\n", + "/content/Trace-Bench\n", + "Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease\n", + "Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", + "Get:3 https://cli.github.com/packages stable InRelease [3,917 B]\n", + "Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", + "Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:8 https://cli.github.com/packages stable/main amd64 Packages [356 B]\n", + "Get:9 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [4,040 kB]\n", + "Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,609 kB]\n", + "Get:11 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [70.9 kB]\n", + "Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,678 kB]\n", + "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n", + "Get:15 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,895 kB]\n", + "Get:16 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]\n", + "Get:17 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,297 kB]\n", + "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,728 kB]\n", + "Get:19 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.8 kB]\n", + "Get:20 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy/main amd64 Packages [75.3 kB]\n", + "Get:21 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,683 kB]\n", + "Get:22 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,396 kB]\n", + "Get:23 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [62.6 kB]\n", + "Fetched 37.1 MB in 4s (9,313 kB/s)\n", + "Reading package lists... 
Done\n", + "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "graphviz is already the newest version (2.42.2-6ubuntu0.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 57 not upgraded.\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.12/dist-packages (24.1.2)\n", + "Collecting pip\n", + " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m19.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 24.1.2\n", + " Uninstalling pip-24.1.2:\n", + " Successfully uninstalled pip-24.1.2\n", + "Successfully installed pip-26.0.1\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (6.0.3)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.12/dist-packages (8.4.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: graphviz in /usr/local/lib/python3.12/dist-packages (0.21)\n", + "Collecting litellm==1.75.0\n", + " Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)\n", + "Requirement already satisfied: aiohttp>=3.10 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.13.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.3.1)\n", + "Requirement already satisfied: httpx>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (8.7.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (4.26.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.17.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (2.12.3)\n", + "Requirement already satisfied: python-dotenv>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (1.2.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.12.0)\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.12/dist-packages (from litellm==1.75.0) (0.22.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2<4.0.0,>=3.1.2->litellm==1.75.0) (3.0.3)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in 
/usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema<5.0.0,>=4.22.0->litellm==1.75.0) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (2.41.4)\n", + "Requirement already satisfied: typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (4.15.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.5.0->litellm==1.75.0) (0.4.2)\n", + "Requirement already satisfied: iniconfig>=1 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.3.0)\n", + "Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.12/dist-packages (from pytest) (26.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/lib/python3.12/dist-packages (from pytest) (1.6.0)\n", + "Requirement already satisfied: pygments>=2.7.2 in /usr/local/lib/python3.12/dist-packages (from pytest) (2.19.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp>=3.10->litellm==1.75.0) (1.22.0)\n", + "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.12/dist-packages (from yarl<2.0,>=1.17.0->aiohttp>=3.10->litellm==1.75.0) (3.11)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from 
httpx>=0.23.0->litellm==1.75.0) (4.12.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (2026.1.4)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx>=0.23.0->litellm==1.75.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx>=0.23.0->litellm==1.75.0) (0.16.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata>=6.8.0->litellm==1.75.0) (3.23.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (0.13.0)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.12/dist-packages (from openai>=1.68.2->litellm==1.75.0) (4.67.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2025.11.3)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.12/dist-packages (from tiktoken>=0.7.0->litellm==1.75.0) (2.32.4)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (3.4.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm==1.75.0) (2.5.0)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=0.16.4 in /usr/local/lib/python3.12/dist-packages (from tokenizers->litellm==1.75.0) (1.4.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (3.20.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.2.0)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (1.5.4)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.16.4->tokenizers->litellm==1.75.0) (0.21.1)\n", + "Downloading litellm-1.75.0-py3-none-any.whl (8.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: litellm\n", + "Successfully installed litellm-1.75.0\n" + ] + } + ], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch m1/deliverable https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental 
https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0\n" + ], + "id": "swOi3Bhtm0UQ" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "a__iRJTHm0UR", + "outputId": "3f85acb0-c2be-4ae4-fcf7-3aea796bf95b", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== List trainers ===\n", + "AggregatedUpdate\tavailable\n", + "BasicSearchAlgorithm\tavailable\n", + "BeamSearch\tavailable\n", + "BeamsearchAlgorithm\tavailable\n", + "BeamsearchHistoryAlgorithm\tavailable\n", + "GEPA-Base\tavailable\n", + "GEPA-Beam\tavailable\n", + "GEPA-UCB\tavailable\n", + "Minibatch\tavailable\n", + "MinibatchAlgorithm\tavailable\n", + "PrioritySearch\tavailable\n", + "PrioritySearch_with_Regressor\tavailable\n", + "SearchTemplate\tavailable\n", + "SequentialSearch\tavailable\n", + "SequentialUpdate\tavailable\n", + "StreamingPrioritySearch\tavailable\n", + "UCBSearchAlgorithm\tavailable\n", + "\n", + "=== Validate config (strict) ===\n", + "[OK] internal:code_param\n", + "[OK] internal:numeric_param\n", + "[OK] internal:multi_param\n", + "[OK] internal:non_trainable\n", + "[EXPECTED] internal:non_trainable: no_trainable_parameters\n", + "[OK] trace_examples:greeting_stub\n", + "[OK] llm4ad:circle_packing\n", + "[SKIP] veribench:smoke_placeholder: veribench_unavailable: entrypoint not available (install Veribench or provide task list)\n", + "\n", + "[OK] matrix: 28 jobs expanded deterministically\n", + " job 741cb015f747: internal:code_param x PrioritySearch (seed=123)\n", + " job deec0f7230de: internal:code_param x GEPA-Base (seed=123)\n", + " job 09eba11e01cf: internal:code_param x GEPA-UCB (seed=123)\n", + " job 2baa9d102ae9: internal:code_param x GEPA-Beam (seed=123)\n", + " job 84b26f14a134: internal:numeric_param x PrioritySearch (seed=123)\n", + " job 2cdd86425cca: internal:numeric_param x GEPA-Base (seed=123)\n", + " job 0fdc0343cc34: internal:numeric_param x GEPA-UCB (seed=123)\n", + " job ce5b3461d160: internal:numeric_param x GEPA-Beam (seed=123)\n", + " job 9531e7285512: internal:multi_param x PrioritySearch (seed=123)\n", + " job e8011aad9336: internal:multi_param x GEPA-Base (seed=123)\n", + " job ecd3fbbd3c42: internal:multi_param x GEPA-UCB (seed=123)\n", + " job 2dd76882fd19: internal:multi_param x GEPA-Beam (seed=123)\n", + " job d52d40ca6b77: internal:non_trainable x PrioritySearch (seed=123)\n", + " job eb30b13f2e14: internal:non_trainable x GEPA-Base (seed=123)\n", + " job c865b1ec0cbc: internal:non_trainable x GEPA-UCB (seed=123)\n", + " job d870163c477d: internal:non_trainable x GEPA-Beam (seed=123)\n", + " job 3a1216485e9b: trace_examples:greeting_stub x PrioritySearch (seed=123)\n", + " job 8538a43564b6: trace_examples:greeting_stub x GEPA-Base (seed=123)\n", + " job 28906417633f: trace_examples:greeting_stub x GEPA-UCB (seed=123)\n", + " job 94315da580b9: trace_examples:greeting_stub x GEPA-Beam (seed=123)\n", + " job 1dda87fd7ae7: llm4ad:circle_packing x PrioritySearch (seed=123)\n", + " job 4e4ef0c85cf3: llm4ad:circle_packing x GEPA-Base (seed=123)\n", + " job 977a714b5483: llm4ad:circle_packing x GEPA-UCB (seed=123)\n", + " job 6f9dc2e38ac8: llm4ad:circle_packing x GEPA-Beam (seed=123)\n", + " job e7fa76b4eab5: 
veribench:smoke_placeholder x PrioritySearch (seed=123)\n", + " job 51c6a932b453: veribench:smoke_placeholder x GEPA-Base (seed=123)\n", + " job e006c4e16c3b: veribench:smoke_placeholder x GEPA-UCB (seed=123)\n", + " job a0147226edd9: veribench:smoke_placeholder x GEPA-Beam (seed=123)\n", + "\n", + " tasks: ['internal:code_param', 'internal:multi_param', 'internal:non_trainable', 'internal:numeric_param', 'llm4ad:circle_packing', 'trace_examples:greeting_stub', 'veribench:smoke_placeholder']\n", + " trainers: ['GEPA-Base', 'GEPA-Beam', 'GEPA-UCB', 'PrioritySearch']\n", + "[OK] manifest written: /content/drive/MyDrive/bench/2026-02-11/trace_bench/20260211-131538-8e24e6b0/meta/manifest.json\n", + "\n", + "=== Generate M1 run config (mode=real) ===\n", + "Config mode: real\n", + "\n", + "=== Run M1 validation ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:0: def f(x): return x\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, code):\n", + " return code\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. 
Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code1_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:1: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -0.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:1: 2.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:2: 1.0\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code2_copy:0: def combine(self, a, b):\n", + " return float(getattr(a, \"data\", a)) + float(getattr(b, \"data\", b))\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: 1.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: 1.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 0] \u001b[91mParameter/str:20: Hello\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 1.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: 1.0\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 1\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 2\n", + "[Step 1] Update/best_candidate_priority: 1.0\n", + "[Step 1] Update/best_candidate_mean_score: 1.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 1\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 1.0\n", + "[Step 1] Sample/num_samples: 1\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 3\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code3_copy:0: def compose(self, greeting, name: str):\n", + " greeting_value = getattr(greeting, \"data\", greeting)\n", + " return f\"{greeting_value}, {name}!\"\u001b[0m\n", + "[Step 1] \u001b[91mParameter/str:20: Hello\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with both short-term and long-term memory. Candidates will be merged into long-term memory every 1 iterations.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.063446105401886\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.5240756148\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.063446105401886\n", + "[Step 1] Update/best_candidate_mean_score: 1.063446105401886\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.9518487703269418\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.9518487703269418\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 0.9518487703269418\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:4: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " np.random.seed(2025)\n", + " circles = []\n", + " attempts = 0\n", + " max_attempts = 10000 # Limit attempts to prevent infinite loop\n", + "\n", + " while len(circles) < n and attempts < max_attempts:\n", + " radius = np.random.uniform(0.01, 0.1) # Random radius\n", + " x = np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " \n", + " # Check for overlap\n", + " overlap = False\n", + " for (cx, cy, cr) in circles:\n", + " distance = math.sqrt((cx - x) ** 2 + (cy - y) ** 2)\n", + " if distance < (cr + radius):\n", + " overlap = True\n", + " break\n", + "\n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + "\n", + " attempts += 1\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.3509495181645703\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_run.yaml <\n", + "
[Rendered `df.head()` of `results.csv` (HTML table output reduced to its data; first 5 rows shown):]

| | run_id | job_id | task_id | suite | trainer_id | seed | status | score_initial | score_final | score_best | time_seconds | resolved_trainer_kwargs | resolved_optimizer_kwargs | eval_kwargs | feedback | tb_logdir |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20260211-131540-093c5358 | 741cb015f747 | internal:code_param | internal | PrioritySearch | 123 | ok | 1.0 | 1.0 | 1.0 | 4.113878 | {"memory_update_frequency": 1, "num_batches": ... | {"memory_size": 5, "objective": "Match the tar... | {"timeout_seconds": 10} | Correct | jobs/741cb015f747/tb |
| 1 | 20260211-131540-093c5358 | deec0f7230de | internal:code_param | internal | GEPA-Base | 123 | ok | 1.0 | 1.0 | 1.0 | 2.601084 | {"merge_every": 2, "num_iters": 1, "num_thread... | {"memory_size": 5, "objective": "Match the tar... | {"timeout_seconds": 10} | Correct | jobs/deec0f7230de/tb |
| 2 | 20260211-131540-093c5358 | 84b26f14a134 | internal:numeric_param | internal | PrioritySearch | 123 | ok | -3.0 | -0.0 | -0.0 | 6.988559 | {"memory_update_frequency": 1, "num_batches": ... | {"memory_size": 5, "objective": "Match the num... | {"timeout_seconds": 10} | target=3.0 | jobs/84b26f14a134/tb |
| 3 | 20260211-131540-093c5358 | 2cdd86425cca | internal:numeric_param | internal | GEPA-Base | 123 | ok | -0.0 | -0.0 | -0.0 | 4.882765 | {"merge_every": 2, "num_iters": 1, "num_thread... | {"memory_size": 5, "objective": "Match the num... | {"timeout_seconds": 10} | target=3.0 | jobs/2cdd86425cca/tb |
| 4 | 20260211-131540-093c5358 | 9531e7285512 | internal:multi_param | internal | PrioritySearch | 123 | ok | -1.0 | -0.0 | -0.0 | 9.249504 | {"memory_update_frequency": 1, "num_batches": ... | {"memory_size": 5, "objective": "Make a+b matc... | {"timeout_seconds": 10} | target=3.0 | jobs/9531e7285512/tb |
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 14,\n \"fields\": [\n {\n \"column\": \"run_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"20260211-131540-093c5358\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"job_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"8538a43564b6\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"internal:code_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"trace_examples\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_initial\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 316227.80338516145,\n \"min\": -1000000.0,\n \"max\": 1.063446105401886,\n \"num_unique_values\": 6,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_final\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5618951165634496,\n \"min\": -0.0,\n \"max\": 1.3509495181645703,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5618951165634496,\n \"min\": -0.0,\n \"max\": 1.3509495181645703,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"time_seconds\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.042801912329264,\n \"min\": 0.000113,\n \"max\": 30.771424,\n \"num_unique_values\": 14,\n \"samples\": [\n 0.604331\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_trainer_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"{\\\"merge_every\\\": 2, \\\"num_iters\\\": 1, \\\"num_threads\\\": 2, \\\"pareto_subset_size\\\": 2, \\\"train_batch_size\\\": 2}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"resolved_optimizer_kwargs\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"{\\\"memory_size\\\": 5, \\\"objective\\\": \\\"Match the target code exactly.\\\"}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"eval_kwargs\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n 
\"{\\\"timeout_seconds\\\": 10}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"feedback\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Correct\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tb_logdir\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"jobs/8538a43564b6/tb\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Inspect latest run artifacts\n", + "import pathlib, json, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "run_dir = None\n", + "for p in reversed(candidates):\n", + " if (p / \"meta\" / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " for p in reversed(candidates):\n", + " if (p / \"config.snapshot.yaml\").exists():\n", + " run_dir = p\n", + " break\n", + "\n", + "if run_dir is None:\n", + " raise FileNotFoundError(\"No run folder with config snapshot found under RUNS_DIR\")\n", + "\n", + "print(\"Run dir:\", run_dir)\n", + "\n", + "config_path = run_dir / \"meta\" / \"config.snapshot.yaml\"\n", + "env_path = run_dir / \"meta\" / \"env.json\"\n", + "manifest_path = run_dir / \"meta\" / \"manifest.json\"\n", + "\n", + "if not config_path.exists():\n", + " config_path = run_dir / \"config.snapshot.yaml\"\n", + " env_path = run_dir / \"env.json\"\n", + "\n", + "config_text = config_path.read_text()\n", + "print(config_text[:400])\n", + "\n", + "if manifest_path.exists():\n", + " manifest = json.loads(manifest_path.read_text())\n", + " print(\"Jobs in manifest:\", len(manifest.get(\"jobs\", [])))\n", + "\n", + "df = pd.read_csv(run_dir / \"results.csv\")\n", + "df.head()\n" + ], + "id": "ckY1HmQam0UU" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gpkb4-1Em0UW" + }, + "source": [ + "## 2x2 Bounded Matrix Smoke (Plan A+ Pareto)\n", + "\n", + "Run exactly **2 tasks x 2 trainers x 1 seed = 4 jobs** and verify `results.csv` has 4 rows." + ], + "id": "gpkb4-1Em0UW" + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dMn7PDVgm0UX", + "outputId": "c8da1604-eca7-44c4-9736-894cbf386f67", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== 2x2 Matrix Smoke (mode=real) ===\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -3.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -3.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -3.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 0] \u001b[91mParameter/float:0: 0.0\u001b[0m\n", + "Epoch: 0. Iteration: 1\n", + "[Step 1] Test/test_score: 0.0\n", + "[Step 1] \u001b[94mAlgo/Average train score: -1.5\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 3\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 0.0\n", + "[Step 1] Update/best_candidate_mean_score: 0.0\n", + "[Step 1] Update/best_candidate_num_rollouts: 2\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 0.0\n", + "[Step 1] Update/exploration_candidates_mean_score: 0.0\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 2.0\n", + "[Step 1] Sample/mean_score: 0.0\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code0_copy:0: def emit(self, value):\n", + " return value\u001b[0m\n", + "[Step 1] \u001b[91mParameter/float:0: 3.0\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 0.0\u001b[0m\n", + "Warning: You are using PrioritySearch trainer, which is an experimental feature. Please report any issues you encounter.\n", + "PrioritySearch initialized with only long-term memory.\n", + "Epoch: 0. 
Iteration: 0\n", + "[Step 0] Test/test_score: -1000000.0\n", + "[Step 0] \u001b[94mAlgo/Average train score: -1000000.0\u001b[0m\n", + "[Step 0] Update/n_iters: 0\n", + "[Step 0] Update/short_term_memory_size: 0\n", + "[Step 0] Update/long_term_memory_size: 2\n", + "[Step 0] Update/using_short_term_memory: False\n", + "[Step 0] Update/using_long_term_memory: True\n", + "[Step 0] Update/total_samples: 0\n", + "[Step 0] Update/best_candidate_priority: inf\n", + "[Step 0] Update/best_candidate_num_rollouts: 0\n", + "[Step 0] Update/num_exploration_candidates: 2\n", + "[Step 0] Update/exploration_candidates_mean_priority: inf\n", + "[Step 0] Update/exploration_candidates_average_num_rollouts: 0.0\n", + "[Step 0] Sample/mean_score: -1000000.0\n", + "[Step 0] Sample/num_samples: 2\n", + "[Step 0] Sample/self.n_epochs: 0\n", + "[Step 0] \u001b[94mAlgo/Number of training samples: 2\u001b[0m\n", + "[Step 0] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + "\n", + " grid_size = int(np.ceil(np.sqrt(n)))\n", + " radius = 0.5 / grid_size\n", + "\n", + " circles = []\n", + " for i in range(n):\n", + " row = i // grid_size\n", + " col = i % grid_size\n", + " x = (col + 0.5) / grid_size\n", + " y = (row + 0.5) / grid_size\n", + " circles.append([x, y, radius])\n", + "\n", + " return np.array(circles)\u001b[0m\n", + "Epoch: 0. 
Iteration: 1\n", + "[Step 1] Test/test_score: 1.3000000000000003\n", + "[Step 1] \u001b[94mAlgo/Average train score: -499999.4091384736\u001b[0m\n", + "[Step 1] Update/n_iters: 1\n", + "[Step 1] Update/short_term_memory_size: 0\n", + "[Step 1] Update/long_term_memory_size: 5\n", + "[Step 1] Update/using_short_term_memory: False\n", + "[Step 1] Update/using_long_term_memory: True\n", + "[Step 1] Update/total_samples: 6\n", + "[Step 1] Update/best_candidate_priority: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_mean_score: 1.3000000000000003\n", + "[Step 1] Update/best_candidate_num_rollouts: 1\n", + "[Step 1] Update/num_exploration_candidates: 2\n", + "[Step 1] Update/exploration_candidates_mean_priority: 1.181723052700943\n", + "[Step 1] Update/exploration_candidates_mean_score: 1.181723052700943\n", + "[Step 1] Update/exploration_candidates_average_num_rollouts: 1.0\n", + "[Step 1] Sample/mean_score: 1.181723052700943\n", + "[Step 1] Sample/num_samples: 2\n", + "[Step 1] Sample/self.n_epochs: 1\n", + "[Step 1] \u001b[94mAlgo/Number of training samples: 4\u001b[0m\n", + "[Step 1] \u001b[91mParameter/__code:1: import numpy as np\n", + "import math\n", + "\n", + "def pack_circles(n: int) -> np.ndarray:\n", + " \"\"\"\n", + " Pack n circles in a unit square to maximize sum of radii.\n", + " \n", + " Args:\n", + " n: Number of circles to pack\n", + "\n", + " Returns:\n", + " Numpy array of shape (n, 3) where each row is (x, y, radius)\n", + " All values should be between 0 and 1\n", + " Circles must not overlap\n", + " \n", + " Important: Set \"all\" random seeds to 2025, including the packages (such as scipy sub-packages) involving random seeds.\n", + " \"\"\"\n", + " \n", + " np.random.seed(2025)\n", + " radius = 0.05 # Set a fixed radius for simplicity\n", + " circles = []\n", + " \n", + " for _ in range(n):\n", + " while True:\n", + " x = np.random.uniform(radius, 1 - radius)\n", + " y = np.random.uniform(radius, 1 - radius)\n", + " # Check for overlap\n", + " overlap = False\n", + " for circle in circles:\n", + " if ((x - circle[0]) ** 2 + (y - circle[1]) ** 2) < (2 * radius) ** 2:\n", + " overlap = True\n", + " break\n", + " if not overlap:\n", + " circles.append([x, y, radius])\n", + " break\n", + " \n", + " return np.array(circles)\u001b[0m\n", + "[Step 1] \u001b[92mGEPA(base) best mean: 1.3000000000000003\u001b[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\rSampling training minibatch: Sampling 2 agents on 1 inputs: 0%| | 0/2 [00:00 /content/m1_matrix.yaml <\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
task_idsuitetrainer_idseedstatusscore_best
0internal:numeric_paraminternalPrioritySearch123ok-0.0
1internal:numeric_paraminternalGEPA-Base123ok-0.0
2llm4ad:circle_packingllm4adPrioritySearch123ok1.3
3llm4ad:circle_packingllm4adGEPA-Base123ok1.3
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df[[\\\"task_id\\\", \\\"suite\\\", \\\"trainer_id\\\", \\\"seed\\\", \\\"status\\\", \\\"score_best\\\"]]\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"task_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad:circle_packing\",\n \"internal:numeric_param\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"suite\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"llm4ad\",\n \"internal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trainer_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"GEPA-Base\",\n \"PrioritySearch\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 123,\n \"max\": 123,\n \"num_unique_values\": 1,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ok\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score_best\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7505553499465139,\n \"min\": -0.0,\n \"max\": 1.3000000000000005,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.3000000000000005\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "# Verify 2x2 matrix: exactly 4 rows in results.csv\n", + "import json, pathlib, pandas as pd\n", + "\n", + "runs_root = pathlib.Path(RUNS_DIR)\n", + "candidates = sorted([p for p in runs_root.iterdir() if p.is_dir()], key=lambda p: p.stat().st_mtime)\n", + "\n", + "matrix_dir = None\n", + "for p in reversed(candidates):\n", + " summary_path = p / \"summary.json\"\n", + " if not summary_path.exists():\n", + " continue\n", + " try:\n", + " summary = json.loads(summary_path.read_text())\n", + " except Exception:\n", + " continue\n", + " if summary.get(\"total_jobs\") == 4:\n", + " matrix_dir = p\n", + " break\n", + "\n", + "if matrix_dir is None:\n", + " raise FileNotFoundError(\"No matrix run with total_jobs==4 found. 
Re-run the matrix cell.\")\n", + "\n", + "print(\"Matrix run dir:\", matrix_dir)\n", + "\n", + "df = pd.read_csv(matrix_dir / \"results.csv\")\n", + "print(f\"\\nresults.csv rows: {len(df)} (expected: 4)\")\n", + "assert len(df) == 4, f\"Expected 4 rows, got {len(df)}\"\n", + "\n", + "summary = json.loads((matrix_dir / \"summary.json\").read_text())\n", + "print(f\"summary.json: {summary}\")\n", + "assert summary.get(\"total_jobs\") == 4\n", + "\n", + "print(\"\\n--- Matrix results ---\")\n", + "df[[\"task_id\", \"suite\", \"trainer_id\", \"seed\", \"status\", \"score_best\"]]\n" + ], + "id": "W18tGXfYm0UZ" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/01_smoke_runner.ipynb b/notebooks/01_smoke_runner.ipynb new file mode 100644 index 0000000..283fb83 --- /dev/null +++ b/notebooks/01_smoke_runner.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trace-Bench Smoke Runner (Stub + Real)\n", + "\n", + "This notebook validates Trace-Bench in two modes:\n", + "\n", + "- **StubLLM**: deterministic, no API keys\n", + "- **Real LLM**: requires a user-provided API key (via Colab Secrets)\n", + "\n", + "It also shows the standardized run artifacts produced by the CLI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Expected Outputs (Quick Verification)\n", + "\n", + "You should see the following signals if the notebook is working correctly:\n", + "\n", + "- **Stub smoke run** completes with a new `runs//` folder.\n", + "- `config.snapshot.yaml`, `env.json`, `results.csv`, `events.jsonl` exist in that folder.\n", + "- `results.csv` shows at least one row with `task=example:greeting_stub` and `status=trained`.\n", + "- **Real-LLM smoke** completes (if API key is set) and `results.csv` shows `status=trained`.\n", + "- `pytest -q` ends with `passed` (LLM4AD optimizer tests run only when `OPENAI_API_KEY` is set)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mount Drive (optional) + compute persistent runs_dir\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import os\n", + "\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount(\"/content/drive\")\n", + "except Exception:\n", + " pass\n", + "\n", + "\n", + "def bench_dir(project=\"bench\", sub=\"trace_bench\", local=\"/content/bench\"):\n", + " drive = Path(\"/content/drive/MyDrive\")\n", + " root = drive if drive.is_dir() else Path(local)\n", + " out = root / project / date.today().isoformat() / sub\n", + " out.mkdir(parents=True, exist_ok=True)\n", + " return str(out)\n", + "\n", + "RUNS_DIR = bench_dir()\n", + "os.environ[\"RUNS_DIR\"] = RUNS_DIR\n", + "print(\"Runs dir:\", RUNS_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clone repos side-by-side (Trace-Bench + OpenTrace)\n", + "!git clone --depth 1 --branch runner-foundation https://github.com/guru-code-expert/Trace-Bench.git\n", + "!git clone --depth 1 --branch experimental https://github.com/guru-code-expert/OpenTrace.git\n", + "\n", + "%cd Trace-Bench\n", + "\n", + "# System + Python deps\n", + "!apt-get update -y && apt-get install -y graphviz\n", + "!python -m pip install -U pip\n", + "!python -m pip install pyyaml pytest numpy matplotlib graphviz litellm==1.75.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: list tasks (external bench discovery)\n", + "!python -m trace_bench list-tasks --root LLM4AD/benchmark_tasks | head -n 30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Stub smoke (internal example task for deterministic output)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect latest run artifacts\n", + "import glob, json, pathlib, pandas as pd\n", + "\n", + "latest = sorted(glob.glob(f\"{RUNS_DIR}/*\"))[-1]\n", + "p = pathlib.Path(latest)\n", + "print(p)\n", + "\n", + "print((p / \"config.snapshot.yaml\").read_text()[:400])\n", + "print(json.loads((p / \"env.json\").read_text()).keys())\n", + "\n", + "pd.read_csv(p / \"results.csv\").head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Optional: external LLM4AD smoke (may yield low score if template fails)\n", + "cat > configs/smoke_llm4ad.yaml <<'YAML'\n", + "runs_dir: runs\n", + "mode: stub\n", + "seed: 123\n", + "tasks:\n", + " - circle_packing\n", + "trainers:\n", + " - PrioritySearch\n", + "YAML\n", + "\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke_llm4ad.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real LLM (requires API key)\n", + "\n", + "Add `OPENAI_API_KEY` in **Colab Secrets** and run the cells below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load API key from Colab Secrets\n", + "from google.colab import userdata\n", + "import os\n", + "\n", + "key = userdata.get(\"OPENAI_API_KEY\")\n", + "if not key:\n", + " raise RuntimeError(\"Missing OPENAI_API_KEY secret in Colab\")\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = key\n", + "os.environ[\"TRACE_DEFAULT_LLM_BACKEND\"] = \"LiteLLM\"\n", + "os.environ[\"TRACE_LITELLM_MODEL\"] = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Real-LLM smoke (internal example task)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m trace_bench run --config configs/smoke_real.yaml --runs-dir \"$RUNS_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "cd /content/Trace-Bench\n", + "\n", + "# Pytest (LLM4AD optimizer test runs only if OPENAI_API_KEY is set)\n", + "PYTHONPATH=/content/OpenTrace:$PYTHONPATH python -m pytest -q" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..be74aa6 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +pythonpath = . +addopts = -p no:langsmith diff --git a/setup.py b/setup.py index 30f3fdb..c879a60 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -import os +ο»Ώimport os import setuptools here = os.path.abspath(os.path.dirname(__file__)) @@ -15,9 +15,10 @@ "black", "scikit-learn", "tensorboardX", - "tensorboard" + "tensorboard", + "pyyaml", ] - + setuptools.setup( name="trace-bench", version=__version__, @@ -27,7 +28,12 @@ license='MIT LICENSE', description="An AutoDiff-like tool for training AI systems end-to-end with general feedback", long_description=open('README.md', encoding="utf8").read(), - packages=setuptools.find_packages(include=["opto*"]), + packages=setuptools.find_packages(include=["trace_bench*", "opto*"]), install_requires=install_requires, python_requires=">=3.9", + entry_points={ + "console_scripts": [ + "trace-bench=trace_bench.cli:main", + ] + }, ) diff --git a/tests/m0/test_config.py b/tests/m0/test_config.py new file mode 100644 index 0000000..71fabf3 --- /dev/null +++ b/tests/m0/test_config.py @@ -0,0 +1,8 @@ +ο»Ώfrom trace_bench.config import load_config + + +def test_load_config_smoke(): + cfg = load_config("configs/smoke.yaml") + assert cfg.mode == "stub" + assert cfg.tasks[0].id == "internal:numeric_param" + assert cfg.runs_dir == "runs" diff --git a/tests/m0/test_runner_smoke.py b/tests/m0/test_runner_smoke.py new file mode 100644 index 0000000..d43891b --- /dev/null +++ b/tests/m0/test_runner_smoke.py @@ -0,0 +1,38 @@ +import csv +import os +from pathlib import Path + +import pytest + +from trace_bench.config import load_config +from trace_bench.runner import BenchRunner + + +def test_runner_smoke(tmp_path): + try: + import graphviz # noqa: F401 + except Exception as exc: # pragma: no cover - dependency check + pytest.fail(f"graphviz is required for smoke: {exc}") + repo_root = Path(__file__).resolve().parents[2] + os.chdir(repo_root) + + cfg = load_config("configs/smoke.yaml") + cfg.runs_dir = str(tmp_path / 
"runs") + + runner = BenchRunner(cfg) + summary = runner.run() + + assert summary.results + run_dir = Path(cfg.runs_dir) / summary.run_id + assert run_dir.exists() + assert (run_dir / "meta" / "config.snapshot.yaml").exists() + assert (run_dir / "meta" / "env.json").exists() + assert (run_dir / "meta" / "manifest.json").exists() + assert (run_dir / "results.csv").exists() + assert (run_dir / "summary.json").exists() + + with (run_dir / "results.csv").open("r", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + assert rows + assert "job_id" in rows[0] + assert any(row.get("status") != "skipped" for row in rows) diff --git a/tests/m0/test_stub_llm.py b/tests/m0/test_stub_llm.py new file mode 100644 index 0000000..5d6cc21 --- /dev/null +++ b/tests/m0/test_stub_llm.py @@ -0,0 +1,25 @@ +ο»Ώimport pytest + +from trace_bench.registry import load_task_bundle + + +def _skip_if_missing_deps(exc: Exception): + msg = str(exc).lower() + if "graphviz" in msg or "opto" in msg: + pytest.skip(f"Optional dependency missing: {exc}") + + +def test_example_tasks_load(): + try: + bundle = load_task_bundle("trace_examples:greeting_stub", "LLM4AD/benchmark_tasks") + except Exception as exc: + _skip_if_missing_deps(exc) + raise + assert {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"}.issubset(bundle.keys()) + + try: + bundle2 = load_task_bundle("trace_examples:train_single_node_stub", "LLM4AD/benchmark_tasks") + except Exception as exc: + _skip_if_missing_deps(exc) + raise + assert {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"}.issubset(bundle2.keys()) diff --git a/tests/m1/test_artifact_serialization.py b/tests/m1/test_artifact_serialization.py new file mode 100644 index 0000000..e52daa5 --- /dev/null +++ b/tests/m1/test_artifact_serialization.py @@ -0,0 +1,57 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def _run_stub(tmp_path: Path) -> Path: + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [{"id": "internal:numeric_param"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"threads": 2}]}], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + summary = BenchRunner(cfg).run() + return Path(cfg.runs_dir) / summary.run_id + + +def test_no_memory_addresses_in_artifacts(tmp_path): + run_dir = _run_stub(tmp_path) + for path in run_dir.rglob("*"): + if not path.is_file(): + continue + if path.suffix not in {".json", ".jsonl", ".csv"}: + continue + text = path.read_text(encoding="utf-8") + assert "object at 0x" not in text + + +def test_structured_nested_fields_in_outputs(tmp_path): + run_dir = _run_stub(tmp_path) + job_dir = next((run_dir / "jobs").iterdir()) + + meta = json.loads((job_dir / "job_meta.json").read_text(encoding="utf-8")) + assert isinstance(meta["resolved_optimizer_kwargs"], dict) + assert isinstance(meta["resolved_trainer_kwargs"], dict) + + results = json.loads((job_dir / "results.json").read_text(encoding="utf-8")) + assert isinstance(results["resolved_optimizer_kwargs"], dict) + assert isinstance(results["resolved_trainer_kwargs"], dict) + + event_lines = (job_dir / "events.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert event_lines + event = json.loads(event_lines[0]) + assert isinstance(event["resolved_optimizer_kwargs"], dict) + assert isinstance(event["resolved_trainer_kwargs"], dict) + + with (run_dir / "results.csv").open("r", encoding="utf-8") as handle: + rows = 
list(csv.DictReader(handle)) + assert rows + parsed = json.loads(rows[0]["resolved_optimizer_kwargs"]) + assert isinstance(parsed, dict) + diff --git a/tests/m1/test_artifacts_layout.py b/tests/m1/test_artifacts_layout.py new file mode 100644 index 0000000..618607e --- /dev/null +++ b/tests/m1/test_artifacts_layout.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from trace_bench.config import load_config +from trace_bench.runner import BenchRunner + + +def test_artifacts_layout(tmp_path): + cfg = load_config("configs/smoke.yaml") + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + assert (run_dir / "meta" / "config.snapshot.yaml").exists() + assert (run_dir / "meta" / "env.json").exists() + assert (run_dir / "meta" / "git.json").exists() + assert (run_dir / "meta" / "manifest.json").exists() + assert (run_dir / "results.csv").exists() + assert (run_dir / "summary.json").exists() + + jobs_dir = run_dir / "jobs" + job_dirs = [p for p in jobs_dir.iterdir() if p.is_dir()] + assert job_dirs, "expected at least one job directory" + job_dir = job_dirs[0] + assert (job_dir / "job_meta.json").exists() + assert (job_dir / "results.json").exists() + assert (job_dir / "events.jsonl").exists() + assert (job_dir / "tb").exists() diff --git a/tests/m1/test_internal_tasks.py b/tests/m1/test_internal_tasks.py new file mode 100644 index 0000000..ac5b674 --- /dev/null +++ b/tests/m1/test_internal_tasks.py @@ -0,0 +1,23 @@ +from trace_bench.config import RunConfig +from trace_bench.registry import load_task_bundle +from trace_bench.runner import BenchRunner + + +def test_internal_tasks_load(): + bundle = load_task_bundle("internal:code_param", "LLM4AD/benchmark_tasks") + assert "param" in bundle + bundle2 = load_task_bundle("internal:numeric_param", "LLM4AD/benchmark_tasks") + assert "param" in bundle2 + + +def test_internal_non_trainable_fails(tmp_path): + cfg = RunConfig.from_dict( + { + "tasks": [{"id": "internal:non_trainable"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"ps_steps": 1}]}], + "seeds": [123], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + summary = BenchRunner(cfg).run() + assert any(row.get("status") == "failed" for row in summary.results) diff --git a/tests/m1/test_manifest_truth.py b/tests/m1/test_manifest_truth.py new file mode 100644 index 0000000..33109f8 --- /dev/null +++ b/tests/m1/test_manifest_truth.py @@ -0,0 +1,42 @@ +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def test_manifest_matches_job_meta(tmp_path): + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [ + {"id": "internal:numeric_param"}, + {"id": "trace_examples:greeting_stub"}, + ], + "trainers": [ + {"id": "PrioritySearch", "params_variants": [{"threads": 2}]}, + {"id": "GEPA-Base", "params_variants": [{"gepa_iters": 1}]}, + ], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + manifest = json.loads((run_dir / "meta" / "manifest.json").read_text(encoding="utf-8")) + + assert manifest["jobs"], "expected manifest jobs" + for entry in manifest["jobs"]: + if entry.get("status") == "not_executed": + continue + job_meta_path = run_dir / "jobs" / entry["job_id"] / "job_meta.json" + assert job_meta_path.exists() + job_meta = json.loads(job_meta_path.read_text(encoding="utf-8")) + assert entry["raw_params"] == job_meta["raw_params"] + 
assert entry["resolved_trainer_kwargs"] == job_meta["resolved_trainer_kwargs"] + assert entry["resolved_optimizer_kwargs"] == job_meta["resolved_optimizer_kwargs"] + assert entry["resolved_guide_kwargs"] == job_meta["resolved_guide_kwargs"] + assert entry["resolved_logger_kwargs"] == job_meta["resolved_logger_kwargs"] + assert entry["eval_kwargs"] == job_meta["eval_kwargs"] + diff --git a/tests/m1/test_matrix.py b/tests/m1/test_matrix.py new file mode 100644 index 0000000..766b194 --- /dev/null +++ b/tests/m1/test_matrix.py @@ -0,0 +1,51 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig, load_config +from trace_bench.matrix import compute_job_id, expand_matrix +from trace_bench.runner import BenchRunner + + +def test_expand_matrix_counts(): + cfg = RunConfig.from_dict( + { + "tasks": [{"id": "internal:numeric_param"}, {"id": "internal:code_param"}], + "trainers": [ + {"id": "PrioritySearch", "params_variants": [{}]}, + {"id": "GEPA-Base", "params_variants": [{}]}, + ], + "seeds": [123], + } + ) + jobs = expand_matrix(cfg) + assert len(jobs) == 4 + + +def test_job_id_stable(): + job_id_1 = compute_job_id("internal:numeric_param", "PrioritySearch", {"ps_steps": 1}, 123) + job_id_2 = compute_job_id("internal:numeric_param", "PrioritySearch", {"ps_steps": 1}, 123) + assert job_id_1 == job_id_2 + + +def test_matrix_smoke_e2e(tmp_path): + """Run 2 tasks x 2 trainers x 1 seed = 4 jobs end-to-end and verify results.""" + cfg = load_config("configs/m1_matrix_smoke.yaml") + cfg.runs_dir = str(tmp_path / "runs") + cfg.mode = "stub" + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + # results.csv must have exactly 4 data rows + results_csv = run_dir / "results.csv" + assert results_csv.exists() + with open(results_csv) as f: + rows = list(csv.DictReader(f)) + assert len(rows) == 4, f"Expected 4 rows in results.csv, got {len(rows)}" + + # summary.json must aggregate 4 jobs + summary_json = run_dir / "summary.json" + assert summary_json.exists() + summary_data = json.loads(summary_json.read_text()) + assert summary_data["total_jobs"] == 4 diff --git a/tests/m1/test_opentrace_examples_smoke.py b/tests/m1/test_opentrace_examples_smoke.py new file mode 100644 index 0000000..f22b275 --- /dev/null +++ b/tests/m1/test_opentrace_examples_smoke.py @@ -0,0 +1,88 @@ +import os +import re +import subprocess +import sys +from pathlib import Path + +import pytest + + +EXAMPLE_ALLOWLIST = { + "autogen", + "datasets", + "dotenv", + "dspy", + "graphviz", + "textgrad", +} + + +def _open_trace_root() -> Path: + repo_root = Path(__file__).resolve().parents[2] + return repo_root.parent / "OpenTrace" + + +def _example_files() -> list[Path]: + root = _open_trace_root() / "examples" + if not root.exists(): + pytest.skip("OpenTrace examples directory not found") + return sorted([p for p in root.rglob("*.py") if p.is_file()]) + + +def _is_argparse_script(path: Path) -> bool: + try: + text = path.read_text(encoding="utf-8") + except Exception: + return False + return "argparse" in text or "ArgumentParser(" in text + + +def _extract_missing_module(output: str) -> str | None: + match = re.search(r"No module named ['\"]([^'\"]+)['\"]", output) + if match: + return match.group(1) + return None + + +def _run_smoke(path: Path): + env = dict(os.environ) + env["PYTHONPATH"] = str(_open_trace_root()) + + env["TRACE_BENCH_SMOKE"] = "1" + + if _is_argparse_script(path): + cmd = [sys.executable, str(path), "--help"] + else: + cmd = [ + sys.executable, + 
"-c", + f"import runpy; runpy.run_path(r'{path.as_posix()}', run_name='__not_main__')", + ] + + try: + proc = subprocess.run( + cmd, + env=env, + capture_output=True, + text=True, + cwd=str(path.parent), + timeout=30, + ) + return proc + except subprocess.TimeoutExpired: + raise AssertionError(f"Smoke timed out for {path}") + + +@pytest.mark.parametrize("path", _example_files()) +def test_opentrace_examples_smoke(path: Path): + strict = os.environ.get("TRACE_BENCH_STRICT_EXAMPLES") == "1" + proc = _run_smoke(path) + if proc.returncode == 0: + return + + output = (proc.stdout or "") + "\n" + (proc.stderr or "") + missing = _extract_missing_module(output) + if missing and missing in EXAMPLE_ALLOWLIST and not strict: + pytest.skip(f"Optional dependency missing for {path.name}: {missing}") + + raise AssertionError(f"Smoke failed for {path}:\n{output}") diff --git a/tests/m1/test_threads_mapping.py b/tests/m1/test_threads_mapping.py new file mode 100644 index 0000000..7746ced --- /dev/null +++ b/tests/m1/test_threads_mapping.py @@ -0,0 +1,32 @@ +import csv +import json +from pathlib import Path + +from trace_bench.config import RunConfig +from trace_bench.runner import BenchRunner + + +def test_threads_maps_to_num_threads(tmp_path): + cfg = RunConfig.from_dict( + { + "mode": "stub", + "seeds": [123], + "tasks": [{"id": "internal:numeric_param"}], + "trainers": [{"id": "PrioritySearch", "params_variants": [{"threads": 3}]}], + } + ) + cfg.runs_dir = str(tmp_path / "runs") + + summary = BenchRunner(cfg).run() + run_dir = Path(cfg.runs_dir) / summary.run_id + + job_dirs = [p for p in (run_dir / "jobs").iterdir() if p.is_dir()] + assert job_dirs, "expected at least one job directory" + meta = json.loads((job_dirs[0] / "job_meta.json").read_text(encoding="utf-8")) + assert meta["resolved_trainer_kwargs"]["num_threads"] == 3 + + with (run_dir / "results.csv").open("r", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + assert rows, "expected at least one results row" + resolved = json.loads(rows[0]["resolved_trainer_kwargs"]) + assert resolved["num_threads"] == 3 diff --git a/tests/m1/test_trainer_config.py b/tests/m1/test_trainer_config.py new file mode 100644 index 0000000..f766c74 --- /dev/null +++ b/tests/m1/test_trainer_config.py @@ -0,0 +1,22 @@ +import pytest + +from trace_bench.config import RunConfig + + +def test_trainer_params_variants_parsed(): + cfg = RunConfig.from_dict( + { + "trainers": [ + { + "id": "PrioritySearch", + "params_variants": [{"ps_steps": 2}], + } + ] + } + ) + assert cfg.trainers[0].params_variants[0]["ps_steps"] == 2 + + +def test_trainer_missing_id_raises(): + with pytest.raises(ValueError): + RunConfig.from_dict({"trainers": [{"params_variants": [{}]}]}) diff --git a/tests/m1/test_validate_runs_dir.py b/tests/m1/test_validate_runs_dir.py new file mode 100644 index 0000000..d881255 --- /dev/null +++ b/tests/m1/test_validate_runs_dir.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from trace_bench.cli import cmd_validate + + +def test_validate_writes_manifest_to_runs_dir(tmp_path): + config_path = tmp_path / "validate.yaml" + config_path.write_text( + "\n".join( + [ + "mode: stub", + "tasks:", + " - id: internal:numeric_param", + "trainers:", + " - id: PrioritySearch", + " params_variants:", + " - threads: 2", + ] + ), + encoding="utf-8", + ) + + runs_dir = tmp_path / "colab_runs" + rc = cmd_validate( + str(config_path), + "LLM4AD/benchmark_tasks", + bench=None, + strict=True, + runs_dir=str(runs_dir), + ) + assert rc == 0 + + run_dirs = [path for path in 
runs_dir.iterdir() if path.is_dir()] + assert run_dirs, "validate should create one run directory under --runs-dir" + manifest_path = run_dirs[0] / "meta" / "manifest.json" + assert manifest_path.exists() + diff --git a/tests/m1/test_veribench_cli.py b/tests/m1/test_veribench_cli.py new file mode 100644 index 0000000..086326b --- /dev/null +++ b/tests/m1/test_veribench_cli.py @@ -0,0 +1,15 @@ +from trace_bench.cli import cmd_list_tasks, cmd_validate + + +def test_veribench_list_tasks_does_not_fail(): + assert cmd_list_tasks("LLM4AD/benchmark_tasks", bench="veribench") == 0 + + +def test_veribench_validate_does_not_fail(tmp_path, capsys): + config_path = tmp_path / "veribench.yaml" + config_path.write_text( + "tasks:\n - id: veribench:smoke_placeholder\n", encoding="utf-8" + ) + assert cmd_validate(str(config_path), "LLM4AD/benchmark_tasks", bench="veribench") == 0 + out = capsys.readouterr().out + assert "[SKIP]" in out diff --git a/tests/test_lite_optimize_llm4ad.py b/tests/test_lite_optimize_llm4ad.py index 39df40c..03994ca 100644 --- a/tests/test_lite_optimize_llm4ad.py +++ b/tests/test_lite_optimize_llm4ad.py @@ -90,6 +90,9 @@ def _get_param_value(param): @pytest.mark.parametrize("task", TASKS) def test_lite_optimize_llm4ad_task(task): + if not os.environ.get("OPENAI_API_KEY"): + pytest.skip("OPENAI_API_KEY not set; skipping LLM-backed optimizer test.") + try: llm4ad_loader = _import_llm4ad_loader() except Exception as exc: diff --git a/trace_bench/__init__.py b/trace_bench/__init__.py new file mode 100644 index 0000000..5899023 --- /dev/null +++ b/trace_bench/__init__.py @@ -0,0 +1,6 @@ +ο»Ώ"""Trace-Bench runner package.""" + +from .config import RunConfig, load_config +from .runner import BenchRunner + +__all__ = ["RunConfig", "load_config", "BenchRunner"] diff --git a/trace_bench/__main__.py b/trace_bench/__main__.py new file mode 100644 index 0000000..6dbaea4 --- /dev/null +++ b/trace_bench/__main__.py @@ -0,0 +1,4 @@ +ο»Ώfrom trace_bench.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/trace_bench/artifacts.py b/trace_bench/artifacts.py new file mode 100644 index 0000000..47566a3 --- /dev/null +++ b/trace_bench/artifacts.py @@ -0,0 +1,260 @@ +ο»Ώfrom __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional +import csv +import json +import os +import re +import subprocess +from datetime import datetime +import platform +import sys + + +@dataclass +class RunArtifacts: + run_dir: Path + meta_dir: Path + jobs_dir: Path + + @property + def config_snapshot(self) -> Path: + return self.meta_dir / "config.snapshot.yaml" + + @property + def env_json(self) -> Path: + return self.meta_dir / "env.json" + + @property + def git_json(self) -> Path: + return self.meta_dir / "git.json" + + @property + def manifest_json(self) -> Path: + return self.meta_dir / "manifest.json" + + @property + def results_csv(self) -> Path: + return self.run_dir / "results.csv" + + @property + def summary_json(self) -> Path: + return self.run_dir / "summary.json" + + +@dataclass +class JobArtifacts: + job_dir: Path + + @property + def job_meta(self) -> Path: + return self.job_dir / "job_meta.json" + + @property + def results_json(self) -> Path: + return self.job_dir / "results.json" + + @property + def events_jsonl(self) -> Path: + return self.job_dir / "events.jsonl" + + @property + def artifacts_dir(self) -> Path: + return self.job_dir / "artifacts" + + @property + def tb_dir(self) -> Path: + return 
self.job_dir / "tb" + + +def init_run_dir(runs_dir: str, run_id: str) -> RunArtifacts: + run_path = Path(runs_dir) / run_id + meta_dir = run_path / "meta" + jobs_dir = run_path / "jobs" + meta_dir.mkdir(parents=True, exist_ok=True) + jobs_dir.mkdir(parents=True, exist_ok=True) + return RunArtifacts(run_dir=run_path, meta_dir=meta_dir, jobs_dir=jobs_dir) + + +def init_job_dir(run_artifacts: RunArtifacts, job_id: str) -> JobArtifacts: + job_dir = run_artifacts.jobs_dir / job_id + job_dir.mkdir(parents=True, exist_ok=True) + (job_dir / "artifacts").mkdir(parents=True, exist_ok=True) + (job_dir / "tb").mkdir(parents=True, exist_ok=True) + return JobArtifacts(job_dir=job_dir) + + +def _dump_yaml_or_json(data: Dict[str, Any]) -> str: + try: + import yaml # type: ignore + return yaml.safe_dump(data, sort_keys=False) + except Exception: + return json.dumps(data, indent=2, sort_keys=False) + + +def write_config_snapshot(path: Path, data: Dict[str, Any]) -> None: + path.write_text(_dump_yaml_or_json(data), encoding="utf-8") + + +def _git_info() -> Dict[str, Any]: + info: Dict[str, Any] = {} + try: + root = Path(__file__).resolve().parents[1] + info["repo_root"] = str(root) + info["commit"] = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=root).decode().strip() + info["branch"] = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=root).decode().strip() + return info + except Exception: + return info + + +_ENV_ALLOWLIST = { + "TRACE_DEFAULT_LLM_BACKEND", + "TRACE_LITELLM_MODEL", + "TRACE_CUSTOMLLM_MODEL", + "TRACE_CUSTOMLLM_URL", + "CUDA_VISIBLE_DEVICES", + "PYTHONPATH", +} + +_ENV_PREFIX_ALLOWLIST = ( + "TRACE_", + "OPENAI_", + "ANTHROPIC_", + "AZURE_", + "HF_", + "HUGGINGFACE_", +) + +_SENSITIVE_TOKENS = ("KEY", "TOKEN", "SECRET", "PASSWORD") + + +def _is_allowed_env_key(key: str) -> bool: + if key in _ENV_ALLOWLIST: + return True + return any(key.startswith(prefix) for prefix in _ENV_PREFIX_ALLOWLIST) + + +def _redact_env_value(key: str, value: str) -> str: + if any(token in key.upper() for token in _SENSITIVE_TOKENS): + return "***REDACTED***" + return value + + +def write_env_json(path: Path) -> None: + env: Dict[str, str] = {} + for key in sorted(os.environ.keys()): + if _is_allowed_env_key(key): + env[key] = _redact_env_value(key, os.environ.get(key, "")) + payload = { + "captured_at": datetime.utcnow().isoformat() + "Z", + "env": env, + "runtime": { + "python_version": sys.version.split()[0], + "platform": platform.platform(), + }, + } + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + + +def write_git_json(path: Path) -> None: + path.write_text(json.dumps(_git_info(), indent=2), encoding="utf-8") + + +_OBJECT_REPR_PATTERN = re.compile(r"<([^>]+) object at 0x[0-9A-Fa-f]+>") +_SENSITIVE_FIELD_TOKENS = ("KEY", "TOKEN", "SECRET", "PASSWORD") + + +def _sanitize_string(value: str) -> str: + return _OBJECT_REPR_PATTERN.sub(r"<\1>", value) + + +def sanitize_for_json(value: Any) -> Any: + if value is None or isinstance(value, (bool, int, float)): + return value + if isinstance(value, str): + return _sanitize_string(value) + if isinstance(value, Path): + return str(value) + if isinstance(value, dict): + sanitized: Dict[str, Any] = {} + for key, item in value.items(): + key_str = str(key) + if any(token in key_str.upper() for token in _SENSITIVE_FIELD_TOKENS): + sanitized[key_str] = "***REDACTED***" + else: + sanitized[key_str] = sanitize_for_json(item) + return sanitized + if isinstance(value, (list, tuple, set)): + return [sanitize_for_json(item) for 
item in value] + + metadata: Dict[str, Any] = { + "__class__": value.__class__.__name__, + "__module__": value.__class__.__module__, + } + for attr in ("model_name", "model", "provider", "backend", "name"): + try: + attr_value = getattr(value, attr) + except Exception: + continue + if attr_value is None: + continue + if isinstance(attr_value, (str, int, float, bool)): + metadata[attr] = sanitize_for_json(attr_value) + elif isinstance(attr_value, Path): + metadata[attr] = str(attr_value) + return metadata + + +def _dump_json(payload: Dict[str, Any]) -> str: + return json.dumps(sanitize_for_json(payload), indent=2, ensure_ascii=False) + + +def write_manifest(path: Path, manifest: Dict[str, Any]) -> None: + path.write_text(_dump_json(manifest), encoding="utf-8") + + +def write_job_meta(path: Path, job_meta: Dict[str, Any]) -> None: + path.write_text(_dump_json(job_meta), encoding="utf-8") + + +def write_job_results(path: Path, results: Dict[str, Any]) -> None: + path.write_text(_dump_json(results), encoding="utf-8") + + +def append_results_csv(path: Path, fieldnames: List[str], row: Dict[str, Any]) -> None: + write_header = not path.exists() + with path.open("a", encoding="utf-8", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + if write_header: + writer.writeheader() + writer.writerow(row) + + +def append_event(path: Path, event: Dict[str, Any]) -> None: + with path.open("a", encoding="utf-8") as f: + f.write(json.dumps(sanitize_for_json(event), ensure_ascii=False) + "\n") + + +def write_summary(path: Path, summary: Dict[str, Any]) -> None: + path.write_text(_dump_json(summary), encoding="utf-8") + + +__all__ = [ + "RunArtifacts", + "JobArtifacts", + "init_run_dir", + "init_job_dir", + "write_config_snapshot", + "write_env_json", + "write_git_json", + "write_manifest", + "write_job_meta", + "write_job_results", + "append_results_csv", + "append_event", + "write_summary", + "sanitize_for_json", +] diff --git a/trace_bench/cli.py b/trace_bench/cli.py new file mode 100644 index 0000000..6340136 --- /dev/null +++ b/trace_bench/cli.py @@ -0,0 +1,319 @@ +ο»Ώfrom __future__ import annotations + +import argparse +import json +from datetime import datetime +from pathlib import Path +import sys + +from trace_bench.config import load_config +from trace_bench.matrix import compute_run_id, expand_matrix +from trace_bench.registry import discover_tasks, discover_trainers, load_task_bundle +from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs +from trace_bench.runner import BenchRunner, _has_trainables +from trace_bench.artifacts import init_run_dir, write_manifest +from trace_bench.ui import launch_ui + + +def cmd_list_tasks(root: str, bench: str | None = None) -> int: + specs = discover_tasks(root, bench=bench) + for spec in specs: + print(spec.id) + return 0 + + +def cmd_list_trainers(include_all: bool = False) -> int: + specs = discover_trainers() + for spec in specs: + if not include_all and not spec.available: + continue + status = "available" if spec.available else "unavailable" + print(f"{spec.id}\t{status}") + return 0 + + +def _task_in_bench(task_key: str, bench: str | None) -> bool: + if not bench: + return True + if ":" not in task_key: + task_key = f"llm4ad:{task_key}" + if "veribench" in bench and task_key.startswith("veribench:"): + return True + if "trace_examples" in bench and task_key.startswith("trace_examples:"): + return True + if "internal" in bench and task_key.startswith("internal:"): + return True + if "llm4ad" in bench and 
task_key.startswith("llm4ad:"): + return True + return False + + +_ALLOWED_TRAINER_KWARGS = { + "threads", + "num_threads", + "num_epochs", + "num_steps", + "num_batches", + "num_candidates", + "num_proposals", + "num_iters", + "num_search_iterations", + "train_batch_size", + "merge_every", + "pareto_subset_size", + "ps_steps", + "ps_batches", + "ps_candidates", + "ps_proposals", + "ps_mem_update", + "gepa_iters", + "gepa_train_bs", + "gepa_merge_every", + "gepa_pareto_subset", + # LLM4AD pass-through knobs (merged into params_variants by config parser) + "optimizer_kwargs", + "eval_kwargs", +} + + +def _resolve_symbol(module_name: str, symbol: str) -> bool: + try: + module = __import__(module_name, fromlist=[symbol]) + return hasattr(module, symbol) + except Exception: + return False + + +def _validate_trainer_params(trainer, errors: list[str]) -> None: + for params in trainer.params_variants or [{}]: + for key in params.keys(): + if key not in _ALLOWED_TRAINER_KWARGS: + errors.append(f"unknown trainer kwarg '{key}' for {trainer.id}") + + if trainer.optimizer and not _resolve_symbol("opto.optimizers", trainer.optimizer): + errors.append(f"optimizer not found: {trainer.optimizer}") + if trainer.guide and not _resolve_symbol("opto.trainer.guide", trainer.guide): + errors.append(f"guide not found: {trainer.guide}") + if trainer.logger and not _resolve_symbol("opto.trainer.loggers", trainer.logger): + errors.append(f"logger not found: {trainer.logger}") + + +def cmd_validate( + config_path: str, + root: str, + bench: str | None = None, + strict: bool = False, + runs_dir: str | None = None, +) -> int: + cfg = load_config(config_path) + if runs_dir: + cfg.runs_dir = runs_dir + tasks_root = Path(root) + errors = 0 + if bench: + discover_tasks(tasks_root, bench=bench) + trainers = discover_trainers() + trainer_ids = {t.id for t in trainers if t.available} + strict_errors: list[str] = [] + for trainer in cfg.trainers: + if trainer.id not in trainer_ids: + errors += 1 + print(f"[FAIL] trainer {trainer.id}: not available") + if strict: + _validate_trainer_params(trainer, strict_errors) + if strict_errors: + for msg in strict_errors: + print(f"[FAIL] {msg}") + errors += len(strict_errors) + + bundle_cache: dict[str, dict | None] = {} + + def _bundle_cache_key(task) -> str: + eval_sig = json.dumps(task.eval_kwargs or {}, sort_keys=True) + return f"{task.id}|{eval_sig}" + + def _cache_bundle(task, bundle): + bundle_cache[_bundle_cache_key(task)] = bundle + + def _get_cached_bundle(task): + key = _bundle_cache_key(task) + if key in bundle_cache: + return bundle_cache[key] + try: + bundle = load_task_bundle(task.id, tasks_root, eval_kwargs=task.eval_kwargs) + _cache_bundle(task, bundle) + except Exception: + bundle_cache[key] = None + return bundle_cache.get(key) + + for task in cfg.tasks: + task_id = task.id + if not _task_in_bench(task_id, bench): + continue + try: + bundle = load_task_bundle(task_id, tasks_root, eval_kwargs=task.eval_kwargs) + _cache_bundle(task, bundle) + print(f"[OK] {task_id}") + if strict: + if not _has_trainables(bundle["param"]): + if task_id == "internal:non_trainable": + print(f"[EXPECTED] {task_id}: no_trainable_parameters") + else: + errors += 1 + print(f"[FAIL] {task_id}: no_trainable_parameters") + except NotImplementedError as exc: + print(f"[SKIP] {task_id}: {exc}") + except Exception as exc: + errors += 1 + print(f"[FAIL] {task_id}: {exc}") + + if strict: + jobs = expand_matrix(cfg) + if not jobs: + errors += 1 + print("[FAIL] matrix: no jobs expanded") + else: + 
print(f"\n[OK] matrix: {len(jobs)} jobs expanded deterministically") + seen_trainers: set[str] = set() + seen_tasks: set[str] = set() + for job in jobs: + seen_trainers.add(job.trainer_id) + seen_tasks.add(job.task_id) + print(f" job {job.job_id}: {job.task_id} x {job.trainer_id} (seed={job.seed})") + print(f"\n tasks: {sorted(seen_tasks)}") + print(f" trainers: {sorted(seen_trainers)}") + run_id = compute_run_id(cfg.snapshot()) + artifacts = init_run_dir(cfg.runs_dir, run_id) + manifest_jobs = [] + for job in jobs: + bundle = _get_cached_bundle(job.task) + status_hint = "ok" + skip_reason = "" + if bundle is None: + try: + bundle = load_task_bundle(job.task_id, tasks_root, eval_kwargs=job.task.eval_kwargs) + _cache_bundle(job.task, bundle) + except NotImplementedError as exc: + status_hint = "skipped" + skip_reason = str(exc) + except Exception as exc: + status_hint = "failed" + skip_reason = f"task_load_error: {exc}" + + manifest_jobs.append( + { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "raw_params": dict(job.params), + "resolved_trainer_kwargs": resolve_trainer_kwargs(job.params, job.trainer_id), + "resolved_optimizer_kwargs": merge_kwargs( + (bundle or {}).get("optimizer_kwargs", {}), + job.trainer.optimizer_kwargs or {}, + ), + "resolved_guide_kwargs": merge_kwargs( + (bundle or {}).get("guide_kwargs"), + job.trainer.guide_kwargs or {}, + ), + "resolved_logger_kwargs": merge_kwargs( + (bundle or {}).get("logger_kwargs"), + job.trainer.logger_kwargs or {}, + ), + "eval_kwargs": dict(job.task.eval_kwargs or {}), + "status_hint": status_hint, + "skip_reason": skip_reason, + } + ) + manifest = { + "run_id": run_id, + "generated_at": datetime.utcnow().isoformat() + "Z", + "jobs": manifest_jobs, + } + write_manifest(artifacts.manifest_json, manifest) + print(f"[OK] manifest written: {artifacts.manifest_json}") + return 1 if errors else 0 + + +def cmd_run( + config_path: str, + root: str, + runs_dir: str | None = None, + max_workers: int | None = None, +) -> int: + cfg = load_config(config_path) + if runs_dir: + cfg.runs_dir = runs_dir + if max_workers is not None: + cfg.max_workers = max_workers + runner = BenchRunner(cfg, tasks_root=root) + runner.run() + return 0 + + +def cmd_ui(runs_dir: str) -> int: + return launch_ui(runs_dir) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="trace-bench") + sub = parser.add_subparsers(dest="cmd", required=True) + + list_p = sub.add_parser("list-tasks", help="List discoverable tasks") + list_p.add_argument("--root", default="LLM4AD/benchmark_tasks") + list_p.add_argument( + "--bench", + "--dataset-name", + dest="bench", + default=None, + help="Bench selection: llm4ad,trace_examples,internal,veribench", + ) + + list_t = sub.add_parser("list-trainers", help="List discoverable trainers") + list_t.add_argument("--all", action="store_true", help="Include unavailable trainers") + + val_p = sub.add_parser("validate", help="Validate tasks in config") + val_p.add_argument("--config", required=True) + val_p.add_argument("--root", default="LLM4AD/benchmark_tasks") + val_p.add_argument( + "--bench", + "--dataset-name", + dest="bench", + default=None, + help="Bench selection: llm4ad,trace_examples,internal,veribench", + ) + val_p.add_argument("--strict", action="store_true") + val_p.add_argument("--runs-dir", "--output-dir", dest="runs_dir", default=None) + + run_p = sub.add_parser("run", help="Run a benchmark config") + 
run_p.add_argument("--config", required=True) + run_p.add_argument("--root", default="LLM4AD/benchmark_tasks") + run_p.add_argument("--runs-dir", "--output-dir", dest="runs_dir", default=None) + run_p.add_argument("--max-workers", "--n-concurrent", dest="max_workers", type=int, default=None) + + ui_p = sub.add_parser("ui", help="Launch Gradio UI (stub)") + ui_p.add_argument("--runs-dir", default="runs") + + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + if args.cmd == "list-tasks": + return cmd_list_tasks(args.root, args.bench) + if args.cmd == "list-trainers": + return cmd_list_trainers(args.all) + if args.cmd == "validate": + return cmd_validate(args.config, args.root, args.bench, args.strict, args.runs_dir) + if args.cmd == "run": + return cmd_run(args.config, args.root, args.runs_dir, args.max_workers) + if args.cmd == "ui": + return cmd_ui(args.runs_dir) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/trace_bench/config.py b/trace_bench/config.py new file mode 100644 index 0000000..6d89237 --- /dev/null +++ b/trace_bench/config.py @@ -0,0 +1,233 @@ +ο»Ώfrom __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional +import json +import uuid + + +_LLM4AD_KNOBS = { + "threads", + "num_threads", + "optimizer_kwargs", + "eval_kwargs", + "ps_steps", + "ps_batches", + "ps_candidates", + "ps_proposals", + "ps_mem_update", + "gepa_iters", + "gepa_train_bs", + "gepa_merge_every", + "gepa_pareto_subset", +} + + +def _load_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def _load_yaml_or_json(path: Path) -> Dict[str, Any]: + text = _load_text(path) + # Prefer YAML if available + try: + import yaml # type: ignore + data = yaml.safe_load(text) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError("Config must be a mapping at top-level") + return data + except Exception: + # Fallback to JSON for environments without PyYAML + try: + data = json.loads(text) + if not isinstance(data, dict): + raise ValueError("Config must be a mapping at top-level") + return data + except json.JSONDecodeError as exc: + raise ValueError( + f"Failed to parse config {path}. Install PyYAML or use JSON syntax. 
Error: {exc}" + ) + + +def _as_dict(value: Optional[Dict[str, Any]]) -> Dict[str, Any]: + return dict(value or {}) + + +def _normalize_key(key: str) -> str: + return key.replace("-", "_") + + +def _extract_llm4ad_knobs(data: Dict[str, Any]) -> Dict[str, Any]: + knobs: Dict[str, Any] = {} + for raw_key, value in data.items(): + key = _normalize_key(raw_key) + if key in _LLM4AD_KNOBS: + knobs[key] = value + return knobs + + +@dataclass +class TaskConfig: + id: str + eval_kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class TrainerConfig: + id: str + params_variants: List[Dict[str, Any]] = field(default_factory=list) + optimizer: Optional[str] = None + optimizer_kwargs: Dict[str, Any] = field(default_factory=dict) + guide: Optional[str] = None + guide_kwargs: Dict[str, Any] = field(default_factory=dict) + logger: Optional[str] = None + logger_kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class RunConfig: + run_id: Optional[str] = None + runs_dir: str = "runs" + mode: str = "stub" + seeds: List[int] = field(default_factory=lambda: [123]) + max_workers: int = 1 + fail_fast: bool = False + tasks: List[TaskConfig] = field(default_factory=list) + trainers: List[TrainerConfig] = field(default_factory=list) + eval_kwargs: Dict[str, Any] = field(default_factory=dict) + trainer_kwargs: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "RunConfig": + runs_dir = data.get("runs_dir", data.get("runs_root", "runs")) + mode = data.get("mode", "stub") + seeds = data.get("seeds") + if seeds is None: + seed = int(data.get("seed", 123)) + seeds = [seed] + else: + seeds = [int(x) for x in (seeds or [])] or [123] + + if "max_workers" in data: + max_workers = data.get("max_workers") + else: + max_workers = data.get("n_concurrent", data.get("n-concurrent", 1)) + max_workers = int(max_workers) + fail_fast = bool(data.get("fail_fast", False)) + + default_eval = _as_dict(data.get("eval_kwargs")) + default_trainer_kwargs = _as_dict(data.get("trainer_kwargs")) + default_trainer_kwargs.update(_extract_llm4ad_knobs(data)) + + tasks: List[TaskConfig] = [] + for item in list(data.get("tasks", []) or []): + if isinstance(item, str): + tasks.append(TaskConfig(id=item, eval_kwargs=dict(default_eval))) + elif isinstance(item, dict): + task_id = item.get("id") or item.get("key") or item.get("task") + if not task_id: + raise ValueError(f"Task entry missing id: {item}") + eval_kwargs = dict(default_eval) + eval_kwargs.update(_as_dict(item.get("eval_kwargs"))) + tasks.append(TaskConfig(id=str(task_id), eval_kwargs=eval_kwargs)) + else: + raise ValueError(f"Unsupported task entry: {item}") + + trainers: List[TrainerConfig] = [] + for item in list(data.get("trainers", []) or []): + if isinstance(item, str): + params_variants = [dict(default_trainer_kwargs)] + trainers.append(TrainerConfig(id=item, params_variants=params_variants)) + continue + if not isinstance(item, dict): + raise ValueError(f"Unsupported trainer entry: {item}") + + trainer_id = item.get("id") or item.get("name") or item.get("trainer") or item.get("key") + if not trainer_id: + raise ValueError(f"Trainer entry missing id: {item}") + + params_variants = item.get("params_variants") + if params_variants is None: + params = item.get("params") or item.get("trainer_kwargs") or {} + params_variants = [params] + normalized_variants: List[Dict[str, Any]] = [] + for variant in list(params_variants or [{}]): + merged = dict(default_trainer_kwargs) + 
merged.update(_extract_llm4ad_knobs(item)) + merged.update(dict(variant or {})) + normalized_variants.append(merged) + + trainers.append( + TrainerConfig( + id=str(trainer_id), + params_variants=normalized_variants, + optimizer=item.get("optimizer"), + optimizer_kwargs=_as_dict(item.get("optimizer_kwargs")), + guide=item.get("guide"), + guide_kwargs=_as_dict(item.get("guide_kwargs")), + logger=item.get("logger"), + logger_kwargs=_as_dict(item.get("logger_kwargs")), + ) + ) + + if not trainers: + trainers = [TrainerConfig(id="PrioritySearch", params_variants=[dict(default_trainer_kwargs)])] + + return cls( + run_id=data.get("run_id"), + runs_dir=runs_dir, + mode=mode, + seeds=seeds, + max_workers=max_workers, + fail_fast=fail_fast, + tasks=tasks, + trainers=trainers, + eval_kwargs=default_eval, + trainer_kwargs=default_trainer_kwargs, + ) + + def ensure_run_id(self) -> str: + if not self.run_id: + self.run_id = str(uuid.uuid4()) + return self.run_id + + def snapshot(self) -> Dict[str, Any]: + return { + "run_id": self.run_id, + "runs_dir": self.runs_dir, + "mode": self.mode, + "seeds": list(self.seeds), + "max_workers": self.max_workers, + "fail_fast": self.fail_fast, + "tasks": [ + {"id": task.id, "eval_kwargs": dict(task.eval_kwargs)} + for task in self.tasks + ], + "trainers": [ + { + "id": trainer.id, + "params_variants": [dict(p) for p in trainer.params_variants], + "optimizer": trainer.optimizer, + "optimizer_kwargs": dict(trainer.optimizer_kwargs), + "guide": trainer.guide, + "guide_kwargs": dict(trainer.guide_kwargs), + "logger": trainer.logger, + "logger_kwargs": dict(trainer.logger_kwargs), + } + for trainer in self.trainers + ], + "eval_kwargs": dict(self.eval_kwargs), + "trainer_kwargs": dict(self.trainer_kwargs), + } + + +def load_config(path: str | Path) -> RunConfig: + config_path = Path(path) + data = _load_yaml_or_json(config_path) + return RunConfig.from_dict(data) + + +__all__ = ["RunConfig", "TaskConfig", "TrainerConfig", "load_config"] diff --git a/trace_bench/examples/__init__.py b/trace_bench/examples/__init__.py new file mode 100644 index 0000000..83e54f4 --- /dev/null +++ b/trace_bench/examples/__init__.py @@ -0,0 +1 @@ +ο»Ώ"""Example tasks for Trace-Bench.""" diff --git a/trace_bench/examples/greeting_stub.py b/trace_bench/examples/greeting_stub.py new file mode 100644 index 0000000..9c119f8 --- /dev/null +++ b/trace_bench/examples/greeting_stub.py @@ -0,0 +1,49 @@ +ο»Ώfrom __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class ExactMatchGuide(Guide): + def get_feedback(self, query: str, response: str, reference: str, **kwargs): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else f"Expected: {reference}" + return score, feedback + + +@trace.model +class GreetingAgent: + def __init__(self): + self.greeting = trace.node("Hello", trainable=True) + + def __call__(self, user_query: str): + name = user_query.split()[-1].strip("!.?") + return self.compose(self.greeting, name) + + @trace.bundle(trainable=True) + def compose(self, greeting, name: str): + greeting_value = getattr(greeting, "data", greeting) + return f"{greeting_value}, {name}!" 
+ + +def build_trace_problem(**override_eval_kwargs): + agent = GreetingAgent() + guide = ExactMatchGuide() + train_dataset = dict( + inputs=["Hello I am Sam"], + infos=["Hello, Sam!"], + ) + optimizer_kwargs = dict( + objective="Generate a correct greeting using the name from the query.", + memory_size=5, + ) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="example", entry="GreetingAgent"), + ) + + +__all__ = ["build_trace_problem", "GreetingAgent"] diff --git a/trace_bench/examples/internal_code_param.py b/trace_bench/examples/internal_code_param.py new file mode 100644 index 0000000..c9c78ce --- /dev/null +++ b/trace_bench/examples/internal_code_param.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class CodeExactGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else "Mismatch" + return score, feedback + + +@trace.model +class CodeParamAgent: + def __init__(self): + self.code = trace.node("def f(x): return x", trainable=True) + + def __call__(self, _input): + return self.emit(self.code) + + @trace.bundle(trainable=True) + def emit(self, code): + return code + + +def build_trace_problem(**_override_eval_kwargs): + agent = CodeParamAgent() + guide = CodeExactGuide() + train_dataset = dict(inputs=[None], infos=["def f(x): return x"]) + optimizer_kwargs = dict(objective="Match the target code exactly.", memory_size=5) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="CodeParamAgent"), + ) + + +__all__ = ["build_trace_problem", "CodeParamAgent"] diff --git a/trace_bench/examples/internal_multi_param.py b/trace_bench/examples/internal_multi_param.py new file mode 100644 index 0000000..d598954 --- /dev/null +++ b/trace_bench/examples/internal_multi_param.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class SumGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class MultiParamAgent: + def __init__(self): + self.a = trace.node(1.0, trainable=True) + self.b = trace.node(1.0, trainable=True) + + def __call__(self, _input): + return self.combine(self.a, self.b) + + @trace.bundle(trainable=True) + def combine(self, a, b): + return float(getattr(a, "data", a)) + float(getattr(b, "data", b)) + + +def build_trace_problem(**_override_eval_kwargs): + agent = MultiParamAgent() + guide = SumGuide() + train_dataset = dict(inputs=[None], infos=[3.0]) + optimizer_kwargs = dict(objective="Make a+b match the target value.", memory_size=5) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="MultiParamAgent"), + ) + + +__all__ = ["build_trace_problem", "MultiParamAgent"] diff --git a/trace_bench/examples/internal_non_trainable.py b/trace_bench/examples/internal_non_trainable.py new file mode 100644 index 0000000..08cec8b --- /dev/null +++ b/trace_bench/examples/internal_non_trainable.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from 
opto import trace +from opto.trainer.guide import Guide + + +class NoTrainGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + score = 1.0 if response == reference else 0.0 + feedback = "Correct" if score == 1.0 else "Mismatch" + return score, feedback + + +@trace.model +class NonTrainableAgent: + def __init__(self): + self.value = trace.node("fixed", trainable=False) + + def __call__(self, _input): + return self.emit(self.value) + + @trace.bundle(trainable=False) + def emit(self, value): + return value + + +def build_trace_problem(**_override_eval_kwargs): + agent = NonTrainableAgent() + guide = NoTrainGuide() + train_dataset = dict(inputs=[None], infos=["fixed"]) + optimizer_kwargs = dict(objective="This should fail due to no trainables.", memory_size=1) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="NonTrainableAgent"), + ) + + +__all__ = ["build_trace_problem", "NonTrainableAgent"] diff --git a/trace_bench/examples/internal_numeric_param.py b/trace_bench/examples/internal_numeric_param.py new file mode 100644 index 0000000..22d1a21 --- /dev/null +++ b/trace_bench/examples/internal_numeric_param.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class NumericGuide(Guide): + def get_feedback(self, _query, response, reference, **_kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class NumericParamAgent: + def __init__(self): + self.value = trace.node(0.0, trainable=True) + + def __call__(self, _input): + return self.emit(self.value) + + @trace.bundle(trainable=True) + def emit(self, value): + return value + + +def build_trace_problem(**_override_eval_kwargs): + agent = NumericParamAgent() + guide = NumericGuide() + train_dataset = dict(inputs=[None], infos=[3.0]) + optimizer_kwargs = dict(objective="Match the numeric target value.", memory_size=5) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="internal", entry="NumericParamAgent"), + ) + + +__all__ = ["build_trace_problem", "NumericParamAgent"] diff --git a/trace_bench/examples/train_single_node_stub.py b/trace_bench/examples/train_single_node_stub.py new file mode 100644 index 0000000..e7f141f --- /dev/null +++ b/trace_bench/examples/train_single_node_stub.py @@ -0,0 +1,50 @@ +ο»Ώfrom __future__ import annotations + +from opto import trace +from opto.trainer.guide import Guide + + +class RegressionGuide(Guide): + def get_feedback(self, query, response, reference, **kwargs): + try: + score = -abs(float(response) - float(reference)) + except Exception: + score = -1.0 + feedback = f"target={reference}" + return score, feedback + + +@trace.model +class SingleNodeAgent: + def __init__(self): + self.guess = trace.node(0.0, trainable=True) + + def __call__(self, _input): + return self.output(self.guess) + + @trace.bundle(trainable=True) + def output(self, guess): + return guess + + +def build_trace_problem(**override_eval_kwargs): + agent = SingleNodeAgent() + guide = RegressionGuide() + train_dataset = dict( + inputs=[None], + infos=[3.0], + ) + optimizer_kwargs = dict( + objective="Match the target scalar value.", + memory_size=5, + ) + return dict( + param=agent, + guide=guide, + train_dataset=train_dataset, + 
optimizer_kwargs=optimizer_kwargs, + metadata=dict(benchmark="example", entry="SingleNodeAgent"), + ) + + +__all__ = ["build_trace_problem", "SingleNodeAgent"] diff --git a/trace_bench/matrix.py b/trace_bench/matrix.py new file mode 100644 index 0000000..ea0f232 --- /dev/null +++ b/trace_bench/matrix.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional +import hashlib +import json +import subprocess + +from trace_bench.config import RunConfig, TaskConfig, TrainerConfig +from trace_bench.resolve import resolve_trainer_kwargs + + +def _git_sha() -> str: + try: + return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip() + except Exception: + return "unknown" + + +def _stable_hash(payload: Dict[str, Any], length: int = 8) -> str: + data = json.dumps(payload, sort_keys=True, default=str).encode("utf-8") + return hashlib.sha256(data).hexdigest()[:length] + + +def compute_run_id(config_snapshot: Dict[str, Any], git_sha: Optional[str] = None) -> str: + timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") + payload = {"config": config_snapshot, "git": git_sha or _git_sha()} + return f"{timestamp}-{_stable_hash(payload, 8)}" + + +def compute_job_id(task_id: str, trainer_id: str, resolved_kwargs: Dict[str, Any], seed: int) -> str: + payload = { + "task_id": task_id, + "trainer_id": trainer_id, + "resolved_kwargs": resolved_kwargs, + "seed": seed, + } + return _stable_hash(payload, 12) + + +def task_suite(task_id: str) -> str: + if ":" in task_id: + return task_id.split(":", 1)[0] + return "llm4ad" + + +def resolve_job_kwargs(task: TaskConfig, trainer: TrainerConfig, params: Dict[str, Any]) -> Dict[str, Any]: + return { + "trainer_kwargs": resolve_trainer_kwargs(params, trainer.id), + "optimizer": trainer.optimizer, + "optimizer_kwargs": dict(trainer.optimizer_kwargs or {}), + "guide": trainer.guide, + "guide_kwargs": dict(trainer.guide_kwargs or {}), + "logger": trainer.logger, + "logger_kwargs": dict(trainer.logger_kwargs or {}), + "eval_kwargs": dict(task.eval_kwargs or {}), + } + + +@dataclass +class JobSpec: + job_id: str + task: TaskConfig + trainer: TrainerConfig + seed: int + params: Dict[str, Any] + resolved_kwargs: Dict[str, Any] + + @property + def task_id(self) -> str: + return self.task.id + + @property + def trainer_id(self) -> str: + return self.trainer.id + + @property + def suite(self) -> str: + return task_suite(self.task_id) + + +def expand_matrix(config: RunConfig) -> List[JobSpec]: + jobs: List[JobSpec] = [] + for task in config.tasks: + for trainer in config.trainers: + variants = trainer.params_variants or [{}] + for params in variants: + for seed in config.seeds: + resolved = resolve_job_kwargs(task, trainer, params) + job_id = compute_job_id(task.id, trainer.id, resolved, seed) + jobs.append( + JobSpec( + job_id=job_id, + task=task, + trainer=trainer, + seed=seed, + params=params, + resolved_kwargs=resolved, + ) + ) + return jobs diff --git a/trace_bench/registry.py b/trace_bench/registry.py new file mode 100644 index 0000000..8096e17 --- /dev/null +++ b/trace_bench/registry.py @@ -0,0 +1,284 @@ +ο»Ώfrom __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Set +import ast +import importlib +import importlib.util +import inspect +import json +import pkgutil +import sys + + +@dataclass +class TaskSpec: + id: str + suite: str + module: str + + 
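+# available=False means the trainer class could not be imported and was only recovered by parsing its source file (see discover_trainers below).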
+@dataclass +class TrainerSpec: + id: str + source: str + available: bool + + +_INTERNAL_TASKS = { + "internal:code_param": "internal_code_param", + "internal:numeric_param": "internal_numeric_param", + "internal:multi_param": "internal_multi_param", + "internal:non_trainable": "internal_non_trainable", +} + +_TRAINER_ALIASES = { + "GEPAAlgorithmBase": "GEPA-Base", + "GEPAUCBSearch": "GEPA-UCB", + "GEPABeamPareto": "GEPA-Beam", +} + +_VERIBENCH_UNAVAILABLE = ( + "veribench_unavailable: entrypoint not available (install Veribench or provide task list)" +) +_VERIBENCH_PLACEHOLDER = "veribench:smoke_placeholder" + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[1] + + +def _ensure_sys_path(path: Path) -> None: + if path.exists(): + path_str = str(path) + if path_str not in sys.path: + sys.path.insert(0, path_str) + + +def ensure_opto_importable() -> None: + try: + import opto # noqa: F401 + return + except Exception: + pass + repo_root = _repo_root() + _ensure_sys_path(repo_root.parent / "OpenTrace") + + +def ensure_llm4ad_importable(tasks_root: Path) -> None: + _ensure_sys_path(_repo_root()) + _ensure_sys_path(tasks_root.parent) + # Provide llm4ad_loader alias for task imports + try: + module = importlib.import_module("LLM4AD.llm4ad_loader") + sys.modules.setdefault("llm4ad_loader", module) + except Exception: + pass + + +def _load_index(tasks_root: Path) -> List[Dict[str, Any]]: + index_path = tasks_root / "index.json" + if not index_path.exists(): + return [] + return json.loads(index_path.read_text(encoding="utf-8")) + + +def discover_llm4ad(tasks_root: Path) -> List[TaskSpec]: + specs: List[TaskSpec] = [] + index = _load_index(tasks_root) + if index: + for entry in index: + key = entry.get("key") + module = entry.get("module") or entry.get("wrapper") + if key and module: + specs.append(TaskSpec(id=f"llm4ad:{key}", suite="llm4ad", module=module)) + return specs + # fallback: directories + for path in tasks_root.iterdir(): + if path.is_dir(): + specs.append(TaskSpec(id=f"llm4ad:{path.name}", suite="llm4ad", module=path.name)) + return specs + + +def discover_trace_examples() -> List[TaskSpec]: + return [ + TaskSpec(id="trace_examples:greeting_stub", suite="trace_examples", module="greeting_stub"), + TaskSpec(id="trace_examples:train_single_node_stub", suite="trace_examples", module="train_single_node_stub"), + ] + + +def discover_internal() -> List[TaskSpec]: + return [ + TaskSpec(id=task_id, suite="internal", module=module) + for task_id, module in _INTERNAL_TASKS.items() + ] + +def discover_veribench() -> List[TaskSpec]: + # Always return a placeholder task so CLI/validate can skip with a reason. + if importlib.util.find_spec("veribench") is None: + return [TaskSpec(id=_VERIBENCH_PLACEHOLDER, suite="veribench", module="veribench_unavailable")] + # Entry point not wired yet; keep placeholder until a task list is provided. 
+ return [TaskSpec(id=_VERIBENCH_PLACEHOLDER, suite="veribench", module="veribench_unavailable")] + + +def _iter_module_names(package_name: str) -> Iterable[str]: + try: + package = importlib.import_module(package_name) + except Exception: + return [] + names: List[str] = [package.__name__] + if hasattr(package, "__path__"): + for module_info in pkgutil.walk_packages(package.__path__, package.__name__ + "."): + names.append(module_info.name) + return names + + +def _class_names_from_file(module_name: str) -> List[str]: + spec = importlib.util.find_spec(module_name) + if spec is None or not spec.origin or not spec.origin.endswith(".py"): + return [] + try: + source = Path(spec.origin).read_text(encoding="utf-8") + tree = ast.parse(source) + except Exception: + return [] + names: List[str] = [] + for node in tree.body: + if not isinstance(node, ast.ClassDef): + continue + base_names: List[str] = [] + for base in node.bases: + if isinstance(base, ast.Name): + base_names.append(base.id) + elif isinstance(base, ast.Attribute): + base_names.append(base.attr) + if any(name.endswith("Trainer") or name.endswith("Algorithm") for name in base_names): + if node.name in {"Trainer", "Algorithm"}: + continue + names.append(node.name) + return names + + +def discover_trainers() -> List[TrainerSpec]: + ensure_opto_importable() + from opto.trainer.algorithms.algorithm import Trainer as TrainerBase + + specs: Dict[str, TrainerSpec] = {} + module_names: List[str] = [] + module_names.extend(_iter_module_names("opto.trainer.algorithms")) + module_names.extend(_iter_module_names("opto.features")) + + for module_name in sorted(set(module_names)): + try: + module = importlib.import_module(module_name) + except Exception: + for class_name in _class_names_from_file(module_name): + trainer_id = _TRAINER_ALIASES.get(class_name, class_name) + if trainer_id not in specs: + specs[trainer_id] = TrainerSpec(id=trainer_id, source=module_name, available=False) + continue + + for _name, obj in vars(module).items(): + if not inspect.isclass(obj): + continue + if obj is TrainerBase: + continue + if not issubclass(obj, TrainerBase): + continue + trainer_id = _TRAINER_ALIASES.get(obj.__name__, obj.__name__) + specs[trainer_id] = TrainerSpec(id=trainer_id, source=obj.__module__, available=True) + return sorted(specs.values(), key=lambda spec: spec.id) + + +def _parse_bench(bench: Optional[str]) -> Set[str]: + if not bench: + return {"llm4ad", "trace_examples", "internal"} + normalized = bench.replace("+", ",") + parts = [p.strip() for p in normalized.split(",") if p.strip()] + if not parts: + return {"llm4ad", "trace_examples", "internal"} + allowed = {"llm4ad", "trace_examples", "internal", "veribench"} + unknown = [p for p in parts if p not in allowed] + if unknown: + raise ValueError(f"Unknown bench selector(s): {unknown}. 
Allowed: {sorted(allowed)}") + return set(parts) + + +def discover_tasks(tasks_root: str | Path, bench: Optional[str] = None) -> List[TaskSpec]: + root = Path(tasks_root) + selected = _parse_bench(bench) + specs: List[TaskSpec] = [] + if "llm4ad" in selected: + specs.extend(discover_llm4ad(root)) + if "trace_examples" in selected: + specs.extend(discover_trace_examples()) + if "internal" in selected: + specs.extend(discover_internal()) + if "veribench" in selected: + specs.extend(discover_veribench()) + return specs + + +def _normalize_task_id(task_id: str) -> str: + if task_id.startswith("example:"): + return task_id.replace("example:", "trace_examples:", 1) + if ":" in task_id: + return task_id + return f"llm4ad:{task_id}" + + +def load_task_module(task_id: str, tasks_root: str | Path): + ensure_opto_importable() + root = Path(tasks_root) + task_id = _normalize_task_id(task_id) + if task_id.startswith("trace_examples:"): + module_name = task_id.split(":", 1)[1] + return importlib.import_module(f"trace_bench.examples.{module_name}") + if task_id.startswith("internal:"): + module_name = _INTERNAL_TASKS.get(task_id, task_id.split(":", 1)[1]) + return importlib.import_module(f"trace_bench.examples.{module_name}") + if task_id.startswith("veribench:"): + raise NotImplementedError(_VERIBENCH_UNAVAILABLE) + + ensure_llm4ad_importable(root) + mapping = {spec.id.split(":", 1)[1]: spec.module for spec in discover_llm4ad(root)} + task_key = task_id.split(":", 1)[1] + module_dir = mapping.get(task_key, task_key) + module_path = root / module_dir / "__init__.py" + if not module_path.exists(): + raise FileNotFoundError(f"Task module not found: {module_path}") + + module_name = f"trace_bench_task_{module_dir}_{abs(hash(str(module_path)))}" + spec = importlib.util.spec_from_file_location(module_name, str(module_path)) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load spec for {module_path}") + mod = importlib.util.module_from_spec(spec) + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + return mod + + +def load_task_bundle(task_id: str, tasks_root: str | Path, eval_kwargs: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + task_id = _normalize_task_id(task_id) + if task_id.startswith("veribench:"): + raise NotImplementedError(_VERIBENCH_UNAVAILABLE) + mod = load_task_module(task_id, tasks_root) + if not hasattr(mod, "build_trace_problem"): + raise AttributeError(f"Task module {task_id} missing build_trace_problem") + bundle = mod.build_trace_problem(**(eval_kwargs or {})) + required = {"param", "guide", "train_dataset", "optimizer_kwargs", "metadata"} + missing = required - set(bundle.keys()) + if missing: + raise KeyError(f"Task bundle missing keys: {sorted(missing)}") + return bundle + + +__all__ = [ + "TaskSpec", + "TrainerSpec", + "discover_tasks", + "discover_trainers", + "discover_veribench", + "load_task_bundle", + "load_task_module", +] diff --git a/trace_bench/resolve.py b/trace_bench/resolve.py new file mode 100644 index 0000000..e285341 --- /dev/null +++ b/trace_bench/resolve.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import Any, Dict, List + + +_FILTERED_KWARGS = {"eval_kwargs", "optimizer_kwargs"} + + +def _default_trainer_kwargs(algo_name: str) -> Dict[str, Any]: + if algo_name == "PrioritySearch": + return dict(num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2) + if algo_name == "GEPA-Base": + return dict(num_iters=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + # GEPA-UCB 
and GEPA-Beam use num_search_iterations + return dict(num_search_iterations=1, train_batch_size=2, merge_every=2, pareto_subset_size=2) + + +def _param_alias_map(algo_name: str) -> Dict[str, str]: + base = { + "threads": "num_threads", + "ps_steps": "num_steps", + "ps_batches": "num_batches", + "ps_candidates": "num_candidates", + "ps_proposals": "num_proposals", + "ps_mem_update": "memory_update_frequency", + "gepa_train_bs": "train_batch_size", + "gepa_merge_every": "merge_every", + "gepa_pareto_subset": "pareto_subset_size", + } + if algo_name == "GEPA-Base": + base["gepa_iters"] = "num_iters" + else: + base["gepa_iters"] = "num_search_iterations" + return base + + +def resolve_trainer_kwargs(params: Dict[str, Any], algo_name: str) -> Dict[str, Any]: + kwargs = _default_trainer_kwargs(algo_name) + alias_map = _param_alias_map(algo_name) + for key, value in (params or {}).items(): + if key in _FILTERED_KWARGS: + continue + mapped_key = alias_map.get(key, key) + kwargs[mapped_key] = value + return kwargs + + +def _clone(value: Any) -> Any: + if isinstance(value, dict): + return {k: _clone(v) for k, v in value.items()} + if isinstance(value, list): + return [_clone(v) for v in value] + return value + + +def merge_kwargs(base: Any, override: Any) -> Any: + if override is None: + return _clone(base) + if base is None: + return _clone(override) + if isinstance(base, dict) and isinstance(override, dict): + merged = dict(base) + merged.update(override) + return merged + if isinstance(base, list) and isinstance(override, dict): + if not base: + return [_clone(override)] + return [ + merge_kwargs(item, override) if isinstance(item, (dict, list)) else _clone(item) + for item in base + ] + if isinstance(base, dict) and isinstance(override, list): + if not override: + return _clone(base) + return [ + merge_kwargs(base, item) if isinstance(item, (dict, list)) else _clone(item) + for item in override + ] + if isinstance(base, list) and isinstance(override, list): + merged: List[Any] = [] + max_len = max(len(base), len(override)) + for idx in range(max_len): + left = base[idx] if idx < len(base) else None + right = override[idx] if idx < len(override) else None + if left is None: + merged.append(_clone(right)) + elif right is None: + merged.append(_clone(left)) + else: + merged.append(merge_kwargs(left, right)) + return merged + return _clone(override) + + +__all__ = ["resolve_trainer_kwargs", "merge_kwargs"] diff --git a/trace_bench/results.py b/trace_bench/results.py new file mode 100644 index 0000000..2e307c6 --- /dev/null +++ b/trace_bench/results.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from typing import Any, Dict, List +import json + +from trace_bench.artifacts import sanitize_for_json + + +RESULT_COLUMNS = [ + "run_id", + "job_id", + "task_id", + "suite", + "trainer_id", + "seed", + "status", + "score_initial", + "score_final", + "score_best", + "time_seconds", + "resolved_trainer_kwargs", + "resolved_optimizer_kwargs", + "eval_kwargs", + "feedback", + "tb_logdir", +] + + +def _json_cell(value: Any) -> str: + return json.dumps(sanitize_for_json(value), sort_keys=True, ensure_ascii=False) + + +def build_results_row( + run_id: str, + job_id: str, + task_id: str, + suite: str, + trainer_id: str, + seed: int, + status: str, + score_initial: Any, + score_final: Any, + score_best: Any, + time_seconds: float, + resolved_trainer_kwargs: Dict[str, Any], + resolved_optimizer_kwargs: Dict[str, Any], + eval_kwargs: Dict[str, Any], + feedback: str | None, + tb_logdir: str, +) -> Dict[str, 
Any]: + return { + "run_id": run_id, + "job_id": job_id, + "task_id": task_id, + "suite": suite, + "trainer_id": trainer_id, + "seed": seed, + "status": status, + "score_initial": score_initial, + "score_final": score_final, + "score_best": score_best, + "time_seconds": round(time_seconds, 6), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "eval_kwargs": eval_kwargs, + "feedback": feedback or "", + "tb_logdir": tb_logdir, + } + + +def build_results_csv_row(row: Dict[str, Any]) -> Dict[str, Any]: + csv_row = dict(row) + csv_row["resolved_trainer_kwargs"] = _json_cell(row.get("resolved_trainer_kwargs")) + csv_row["resolved_optimizer_kwargs"] = _json_cell(row.get("resolved_optimizer_kwargs")) + csv_row["eval_kwargs"] = _json_cell(row.get("eval_kwargs")) + return csv_row + + +def summarize_results(rows: List[Dict[str, Any]]) -> Dict[str, Any]: + counts: Dict[str, int] = {"ok": 0, "failed": 0, "skipped": 0} + for row in rows: + status = row.get("status") or "ok" + if status not in counts: + counts[status] = 0 + counts[status] += 1 + return {"counts": counts, "total_jobs": len(rows)} + + +__all__ = ["RESULT_COLUMNS", "build_results_row", "build_results_csv_row", "summarize_results"] diff --git a/trace_bench/runner.py b/trace_bench/runner.py new file mode 100644 index 0000000..4a8f879 --- /dev/null +++ b/trace_bench/runner.py @@ -0,0 +1,361 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +import json +import random +import time + +from trace_bench.artifacts import ( + RunArtifacts, + append_event, + append_results_csv, + init_job_dir, + init_run_dir, + write_config_snapshot, + write_env_json, + write_git_json, + write_manifest, + write_job_meta, + write_job_results, + write_summary, +) +from trace_bench.config import RunConfig, TaskConfig, TrainerConfig +from trace_bench.matrix import JobSpec, compute_run_id, expand_matrix +from trace_bench.registry import load_task_bundle +from trace_bench.resolve import merge_kwargs, resolve_trainer_kwargs +from trace_bench.results import RESULT_COLUMNS, build_results_csv_row, build_results_row, summarize_results + + +try: + from opto.trace.nodes import ParameterNode +except Exception: # pragma: no cover - only when opto is not available + ParameterNode = object # type: ignore + + +@dataclass +class RunSummary: + run_id: str + results: List[Dict[str, Any]] + + +def _extract_response(model: Any, input_value: Any) -> Any: + if isinstance(model, ParameterNode): + return getattr(model, "data", model) + if callable(model): + output = model(input_value) + return getattr(output, "data", output) + return getattr(model, "data", model) + + +def _evaluate_bundle(bundle: Dict[str, Any]) -> Dict[str, Any]: + dataset = bundle["train_dataset"] + guide = bundle["guide"] + inputs = dataset.get("inputs") or [] + infos = dataset.get("infos") or [] + if not inputs or not infos: + return {"score": None, "feedback": "empty_dataset"} + task_input = inputs[0] + task_info = infos[0] + response = _extract_response(bundle["param"], task_input) + try: + score, feedback = guide(task_input, response, task_info) + except Exception as exc: + return {"score": None, "feedback": f"eval_error: {exc}"} + return {"score": score, "feedback": feedback} + + +def _resolve_algorithm(name: str): + if name == "PrioritySearch": + return "PrioritySearch" + if name == "GEPA-Base": + from 
opto.features.gepa.gepa_algorithms import GEPAAlgorithmBase + return GEPAAlgorithmBase + if name == "GEPA-UCB": + from opto.features.gepa.gepa_algorithms import GEPAUCBSearch + return GEPAUCBSearch + if name == "GEPA-Beam": + from opto.features.gepa.gepa_algorithms import GEPABeamPareto + return GEPABeamPareto + return name + + +def _train_bundle(bundle: Dict[str, Any], trainer_spec: TrainerConfig, params: Dict[str, Any], mode: str) -> Dict[str, Any]: + from opto import trainer as opto_trainer + + algo_name = trainer_spec.id + algo = _resolve_algorithm(algo_name) + kwargs = resolve_trainer_kwargs(params, algo_name) + + optimizer = trainer_spec.optimizer + guide = trainer_spec.guide or bundle["guide"] + logger = trainer_spec.logger or "ConsoleLogger" + guide_kwargs = merge_kwargs(bundle.get("guide_kwargs"), trainer_spec.guide_kwargs or {}) + logger_kwargs = merge_kwargs(bundle.get("logger_kwargs"), trainer_spec.logger_kwargs or {}) + + optimizer_kwargs = merge_kwargs(bundle.get("optimizer_kwargs", {}), trainer_spec.optimizer_kwargs or {}) + + if mode == "stub": + try: + from opto.utils.llm import DummyLLM + + def _dummy_response(*_args, **_kwargs): + return '{"suggestion": {}}' + + dummy = DummyLLM(_dummy_response) + if isinstance(optimizer_kwargs, list): + for item in optimizer_kwargs: + item.setdefault("llm", dummy) + elif isinstance(optimizer_kwargs, dict): + optimizer_kwargs.setdefault("llm", dummy) + except Exception: + pass + + try: + opto_trainer.train( + model=bundle["param"], + train_dataset=bundle["train_dataset"], + algorithm=algo, + guide=guide, + optimizer=optimizer, + logger=logger, + optimizer_kwargs=optimizer_kwargs, + guide_kwargs=guide_kwargs, + logger_kwargs=logger_kwargs, + **kwargs, + ) + return {"status": "ok", "optimizer_kwargs": optimizer_kwargs, "trainer_kwargs": kwargs} + except Exception as exc: + return {"status": "failed", "error": str(exc), "optimizer_kwargs": optimizer_kwargs, "trainer_kwargs": kwargs} + + +def _has_trainables(model: Any) -> bool: + if isinstance(model, ParameterNode): + return bool(getattr(model, "trainable", True)) + if hasattr(model, "parameters"): + try: + params = model.parameters() + return any(getattr(p, "trainable", False) for p in params) + except Exception: + return True + return True + + +class BenchRunner: + def __init__(self, config: RunConfig, tasks_root: str | Path = "LLM4AD/benchmark_tasks"): + self.config = config + self.tasks_root = Path(tasks_root) + random.seed(self.config.seeds[0] if self.config.seeds else 123) + self.artifacts: Optional[RunArtifacts] = None + self._bundle_cache: Dict[str, Dict[str, Any]] = {} + + def _bundle_cache_key(self, task: TaskConfig) -> str: + eval_sig = json.dumps(task.eval_kwargs or {}, sort_keys=True) + return f"{task.id}|{eval_sig}" + + def _get_bundle(self, task: TaskConfig) -> Tuple[str, Optional[Dict[str, Any]], Optional[str]]: + key = self._bundle_cache_key(task) + if key in self._bundle_cache: + cached = self._bundle_cache[key] + return cached["status"], cached.get("bundle"), cached.get("error") + try: + bundle = load_task_bundle(task.id, self.tasks_root, eval_kwargs=task.eval_kwargs) + entry = {"status": "ok", "bundle": bundle, "error": None} + except NotImplementedError as exc: + entry = {"status": "skipped", "bundle": None, "error": str(exc)} + except Exception as exc: + entry = {"status": "failed", "bundle": None, "error": f"task_load_error: {exc}"} + self._bundle_cache[key] = entry + return entry["status"], entry.get("bundle"), entry.get("error") + + def run(self) -> RunSummary: + 
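+        # Expand the task x trainer x variant x seed matrix, execute each job, and write the run-level artifacts (config snapshot, env/git JSON, results.csv, manifest.json, summary.json).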
snapshot = self.config.snapshot() + run_id = self.config.run_id or compute_run_id({k: v for k, v in snapshot.items() if k != "run_id"}) + self.config.run_id = run_id + snapshot = self.config.snapshot() + + self.artifacts = init_run_dir(self.config.runs_dir, run_id) + write_config_snapshot(self.artifacts.config_snapshot, snapshot) + write_env_json(self.artifacts.env_json) + write_git_json(self.artifacts.git_json) + + jobs = expand_matrix(self.config) + + results: List[Dict[str, Any]] = [] + manifest_jobs: List[Dict[str, Any]] = [] + for job in jobs: + row, manifest_job = self._run_job(job) + results.append(row) + manifest_jobs.append(manifest_job) + if self.config.fail_fast and row.get("status") == "failed": + break + + recorded_job_ids = {entry["job_id"] for entry in manifest_jobs} + for job in jobs: + if job.job_id in recorded_job_ids: + continue + status_hint, bundle, skip_reason = self._get_bundle(job.task) + resolved_trainer_kwargs = resolve_trainer_kwargs(job.params, job.trainer_id) + resolved_optimizer_kwargs = merge_kwargs( + bundle.get("optimizer_kwargs", {}) if bundle else {}, + job.trainer.optimizer_kwargs or {}, + ) + resolved_guide_kwargs = merge_kwargs( + bundle.get("guide_kwargs") if bundle else {}, + job.trainer.guide_kwargs or {}, + ) + resolved_logger_kwargs = merge_kwargs( + bundle.get("logger_kwargs") if bundle else {}, + job.trainer.logger_kwargs or {}, + ) + manifest_jobs.append( + { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "raw_params": dict(job.params), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "eval_kwargs": dict(job.task.eval_kwargs or {}), + "status": "not_executed", + "status_hint": status_hint, + "skip_reason": skip_reason or "fail_fast_stopped", + } + ) + + manifest = { + "run_id": run_id, + "generated_at": datetime.utcnow().isoformat() + "Z", + "jobs": manifest_jobs, + } + write_manifest(self.artifacts.manifest_json, manifest) + + write_summary(self.artifacts.summary_json, summarize_results(results)) + return RunSummary(run_id=run_id, results=results) + + def _run_job(self, job: JobSpec) -> Tuple[Dict[str, Any], Dict[str, Any]]: + assert self.artifacts is not None + job_artifacts = init_job_dir(self.artifacts, job.job_id) + start_time = time.time() + status = "ok" + feedback: Optional[str] = None + + status_hint, bundle, bundle_error = self._get_bundle(job.task) + if status_hint != "ok": + status = status_hint + feedback = bundle_error + + score_initial = None + score_final = None + score_best = None + resolved_trainer_kwargs: Dict[str, Any] = resolve_trainer_kwargs(job.params, job.trainer_id) + resolved_optimizer_kwargs: Dict[str, Any] = dict(job.trainer.optimizer_kwargs or {}) + resolved_guide_kwargs = merge_kwargs({}, job.trainer.guide_kwargs) + resolved_logger_kwargs = merge_kwargs({}, job.trainer.logger_kwargs) + + if bundle is not None and status == "ok": + resolved_optimizer_kwargs = merge_kwargs( + bundle.get("optimizer_kwargs", {}), job.trainer.optimizer_kwargs or {} + ) + resolved_guide_kwargs = merge_kwargs( + bundle.get("guide_kwargs"), + job.trainer.guide_kwargs, + ) + resolved_logger_kwargs = merge_kwargs( + bundle.get("logger_kwargs"), + job.trainer.logger_kwargs, + ) + if not _has_trainables(bundle["param"]): + status = "failed" + feedback = "no_trainable_parameters" + else: + initial = 
_evaluate_bundle(bundle) + score_initial = initial.get("score") + train_result = _train_bundle(bundle, job.trainer, job.params, self.config.mode) + status = train_result.get("status", "ok") + resolved_optimizer_kwargs = train_result.get("optimizer_kwargs") or resolved_optimizer_kwargs + resolved_trainer_kwargs = train_result.get("trainer_kwargs") or resolved_trainer_kwargs + if status == "failed": + feedback = f"training_error: {train_result.get('error', 'unknown')}" + final = _evaluate_bundle(bundle) + score_final = final.get("score") + if status != "failed": + feedback = final.get("feedback") or feedback + + if isinstance(score_initial, (int, float)) and isinstance(score_final, (int, float)): + score_best = max(score_initial, score_final) + else: + score_best = score_final if score_final is not None else score_initial + + elapsed = time.time() - start_time + tb_rel = str(Path("jobs") / job.job_id / "tb") + row = build_results_row( + run_id=self.config.run_id or "", + job_id=job.job_id, + task_id=job.task_id, + suite=job.suite, + trainer_id=job.trainer_id, + seed=job.seed, + status=status, + score_initial=score_initial, + score_final=score_final, + score_best=score_best, + time_seconds=elapsed, + resolved_trainer_kwargs=resolved_trainer_kwargs, + resolved_optimizer_kwargs=resolved_optimizer_kwargs, + eval_kwargs=job.task.eval_kwargs, + feedback=feedback, + tb_logdir=tb_rel, + ) + job_meta = { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "status": status, + "raw_params": dict(job.params), + "params": job.params, + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "optimizer": job.trainer.optimizer, + "optimizer_kwargs": job.trainer.optimizer_kwargs, + "guide": job.trainer.guide, + "guide_kwargs": job.trainer.guide_kwargs, + "logger": job.trainer.logger, + "logger_kwargs": job.trainer.logger_kwargs, + "eval_kwargs": job.task.eval_kwargs, + "feedback": feedback or "", + "tb_logdir": tb_rel, + } + write_job_meta(job_artifacts.job_meta, job_meta) + append_results_csv(self.artifacts.results_csv, RESULT_COLUMNS, build_results_csv_row(row)) + append_event(job_artifacts.events_jsonl, row) + write_job_results(job_artifacts.results_json, row) + manifest_job = { + "job_id": job.job_id, + "task_id": job.task_id, + "suite": job.suite, + "trainer_id": job.trainer_id, + "seed": job.seed, + "raw_params": dict(job.params), + "resolved_trainer_kwargs": resolved_trainer_kwargs, + "resolved_optimizer_kwargs": resolved_optimizer_kwargs, + "resolved_guide_kwargs": resolved_guide_kwargs, + "resolved_logger_kwargs": resolved_logger_kwargs, + "eval_kwargs": dict(job.task.eval_kwargs or {}), + "status": status, + "feedback": feedback or "", + } + return row, manifest_job + + +__all__ = ["BenchRunner", "RunSummary"] diff --git a/trace_bench/tasks.py b/trace_bench/tasks.py new file mode 100644 index 0000000..4013d2f --- /dev/null +++ b/trace_bench/tasks.py @@ -0,0 +1,5 @@ +ο»Ώ"""Backward-compatible task helpers. 
Use trace_bench.registry instead.""" + +from .registry import discover_tasks, load_task_bundle, load_task_module, TaskSpec + +__all__ = ["discover_tasks", "load_task_bundle", "load_task_module", "TaskSpec"] diff --git a/trace_bench/ui.py b/trace_bench/ui.py new file mode 100644 index 0000000..f2090e6 --- /dev/null +++ b/trace_bench/ui.py @@ -0,0 +1,60 @@ +ο»Ώfrom __future__ import annotations + +from pathlib import Path +import csv +import json + + +def _read_text(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except Exception: + return "" + + +def _read_csv(path: Path): + if not path.exists(): + return [] + with path.open("r", encoding="utf-8") as f: + reader = csv.DictReader(f) + return list(reader) + + +def launch_ui(runs_dir: str) -> int: + try: + import gradio as gr + except Exception: + print("Gradio is not installed. Install with: pip install gradio") + return 1 + + runs_root = Path(runs_dir) + runs = sorted([p.name for p in runs_root.iterdir() if p.is_dir()]) if runs_root.exists() else [] + + def load_run(run_id: str): + run_path = runs_root / run_id + config_text = _read_text(run_path / "meta" / "config.snapshot.yaml") + results = _read_csv(run_path / "results.csv") + env_text = _read_text(run_path / "meta" / "env.json") + return config_text, results, env_text + + with gr.Blocks() as demo: + gr.Markdown("# Trace-Bench UI (Stub)") + gr.Markdown("Select a run to view config, results, and env info.") + run_selector = gr.Dropdown(choices=runs, label="Run ID") + config_box = gr.Code(label="config.snapshot.yaml", language="yaml") + results_df = gr.Dataframe(label="results.csv") + env_box = gr.Code(label="env.json", language="json") + + run_selector.change(load_run, inputs=run_selector, outputs=[config_box, results_df, env_box]) + + try: + import mlflow # noqa: F401 + gr.Markdown("MLflow detected. Full integration is pending (M3).") + except Exception: + gr.Markdown("MLflow not installed. Install if you want UI-linked runs.") + + demo.launch() + return 0 + + +__all__ = ["launch_ui"]
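
For a quick local check of the modules above, a minimal library-level sketch (assuming the package from this change is installed, e.g. `pip install -e .`, OpenTrace/`opto` is importable, and `configs/smoke.yaml` from this change is on disk):

```python
from trace_bench.config import load_config
from trace_bench.matrix import compute_job_id, expand_matrix
from trace_bench.resolve import resolve_trainer_kwargs
from trace_bench.runner import BenchRunner

# Short config knobs resolve onto trainer kwargs (PrioritySearch defaults filled in).
kwargs = resolve_trainer_kwargs({"ps_steps": 1, "ps_batches": 1}, "PrioritySearch")
# -> num_epochs=1, num_steps=1, num_batches=1, num_candidates=2, num_proposals=2

# Job IDs are deterministic hashes of (task, trainer, resolved kwargs, seed).
jid = compute_job_id("internal:numeric_param", "PrioritySearch", {"trainer_kwargs": kwargs}, 123)
assert jid == compute_job_id("internal:numeric_param", "PrioritySearch", {"trainer_kwargs": kwargs}, 123)

# Expand the task x trainer x variant x seed matrix and run it (stub mode, no API keys).
config = load_config("configs/smoke.yaml")
print([job.job_id for job in expand_matrix(config)])

summary = BenchRunner(config, tasks_root="LLM4AD/benchmark_tasks").run()
print(summary.run_id, [row["status"] for row in summary.results])
```

The CLI path (`trace-bench run --config configs/smoke.yaml`) should exercise the same runner and produce the artifact layout listed in the README.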