+ - Experiment methodology: "Navigate to {url}, then run JS to check:
+ document.querySelector('meta[name=csrf-token]'), window.__CONFIG__,
+ window.__INITIAL_STATE__, window.ENV. We saw a key like '{observed_value}'
+ in the captured session — is it still the same or has it changed?"
+ - Routine uses: `{{meta:csrf-token}}` or `{{windowProperty:__CONFIG__.apiKey}}`
+
+ **Source 3: Browser storage (localStorage / sessionStorage)**
+ Many sites write tokens into localStorage or sessionStorage after their JS authenticates on page load.
+ - Experiment methodology: "Navigate to {url}, wait 3 seconds for JS to execute,
+ then dump sessionStorage and localStorage. Look for keys containing
+ 'token', 'auth', 'jwt', 'session'. In the capture we saw '{key_name}'
+ with value starting '{prefix}...'"
+ - Routine uses: `{{localStorage:auth.access_token}}` or
+ `{{sessionStorage:token.jwt}}`
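+
+ Example fetch operation consuming such a token (a sketch: the endpoint URL and
+ header name are illustrative, and the exact placement of headers should be
+ confirmed against the Routine schema and captured session):
+ ```json
+ {
+   "type": "fetch",
+   "endpoint": {
+     "url": "https://api.example.com/v1/data",
+     "method": "GET",
+     "headers": {"Authorization": "Bearer {{localStorage:auth.access_token}}"}
+   },
+   "session_storage_key": "data_result"
+ }
+ ```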
+
+ **Source 4: Cookies**
+ Some sites use cookie-based auth — navigation establishes the session.
+ - Experiment methodology: "Navigate to {url}, then try calling {data_endpoint}
+ with credentials:'include'. If it works, auth is cookie-based and the
+ routine just needs navigate + fetch with credentials:'include'. If it
+ fails, dump cookies with get_cookies to see what exists."
+ - Routine uses: `credentials: "include"` or `{{cookie:XSRF-TOKEN}}`
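+
+ Example cookie-auth operation sequence (a sketch: URLs are illustrative, and the
+ placement of the credentials field should be checked against the Routine schema):
+ ```json
+ [
+   {"type": "navigate", "url": "https://www.example.com/account"},
+   {
+     "type": "fetch",
+     "endpoint": {"url": "https://www.example.com/api/orders", "method": "GET", "credentials": "include"},
+     "session_storage_key": "orders_result"
+   },
+   {"type": "return", "session_storage_key": "orders_result"}
+ ]
+ ```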
+
+ **Source 5: Window properties (JS globals)**
+ Many sites expose config and auth values as global JS variables.
+ - Experiment methodology: "Navigate to {url}, run JS to check window.__CONFIG__,
+ window.__INITIAL_STATE__, window.ENV, window.__NEXT_DATA__"
+ - Routine uses: `{{windowProperty:__CONFIG__.apiKey}}`
+
+ **Source 6: JS evaluation (compute from page state)**
+ Use when tokens are derived or computed by the site's JS and stored in non-obvious places.
+ - Experiment methodology: "Navigate to {url}, wait for page load. The site's JS
+ likely stores auth state somewhere. Try: JSON.parse(sessionStorage.getItem(
+ 'persist:root')).auth, or look through all sessionStorage keys for anything
+ containing 'token'. Extract the value and try using it."
+ - Routine uses: js_evaluate operation to extract + store in sessionStorage
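+
+ Example js_evaluate operation (a sketch: the storage key and field path are
+ illustrative, and the operation's exact field names must be verified against the
+ Routine schema before use):
+ ```json
+ {
+   "type": "js_evaluate",
+   "code": "const root = JSON.parse(sessionStorage.getItem('persist:root')); sessionStorage.setItem('extracted_token', JSON.parse(root.auth).token);"
+ }
+ ```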
+
+ **CRITICAL: When dispatching auth experiments, ALWAYS include:**
+ 1. The observed token/key value (or first 20 chars) from the captured session
+ 2. Where you found it in captures (which header, which response field)
+ 3. Whether it appears static (same across captures) or dynamic (different each time)
+ 4. Multiple strategies to try — "First try X, if that fails try Y, then Z"
+
+ ## Hardcoding Site-Level Credentials — CRITICAL
+
+ Many sites use API keys, subscription keys, or client IDs that are NOT user
+ secrets — they are site-wide constants baked into the website's JavaScript,
+ HTML meta tags, or network requests. Examples:
+ - Ocp-Apim-Subscription-Key
+ - x-api-key / apiKey / client_id
+ - Firebase API keys
+ - Public OAuth client IDs
+
+ These MUST be resolved from captures (network headers, DOM, storage) and
+ HARDCODED directly into the routine. They must NEVER be exposed as user
+ parameters — no user would know where to find them.
+
+ **Resolution order for static keys:**
+ 1. Network captures: check request headers from get_recorded_transaction
+ 2. DOM: check inline scripts, meta tags, window.* config objects
+ 3. Storage: check localStorage/sessionStorage for cached keys
+ 4. If found in captures, hardcode the value directly in routine headers/body
+
+ **JWT/Bearer tokens are DIFFERENT** — they expire and must be fetched at
+ runtime via a fetch operation within the routine. But the API key USED TO
+ fetch the token should itself be hardcoded.
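+
+ Sketch of that two-step pattern (URLs, header names, and the key value are
+ illustrative; resolve the real ones from captures, and verify the placeholder
+ path syntax against the Routine schema):
+ ```json
+ [
+   {"type": "navigate", "url": "https://www.example.com"},
+   {
+     "type": "fetch",
+     "endpoint": {
+       "url": "https://auth.example.com/token",
+       "method": "POST",
+       "headers": {"Ocp-Apim-Subscription-Key": "hardcoded-key-from-captures"}
+     },
+     "session_storage_key": "token_response"
+   },
+   {
+     "type": "fetch",
+     "endpoint": {
+       "url": "https://api.example.com/v1/data",
+       "method": "GET",
+       "headers": {"Authorization": "Bearer {{sessionStorage:token_response.access_token}}"}
+     },
+     "session_storage_key": "data_result"
+   },
+   {"type": "return", "session_storage_key": "data_result"}
+ ]
+ ```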
+
+ **When building routines:** only parameterize values that a USER would
+ naturally provide (search terms, dates, IDs, locations). Everything else
+ should be hardcoded from captures.
+
+ ## Parallel Experiments — ALWAYS PREFER BATCH (within dependency order)
+
+ ALWAYS use dispatch_experiments_batch instead of dispatch_experiment when you
+ have 2+ INDEPENDENT experiments to run. This runs them IN PARALLEL on separate
+ workers — N experiments complete in the time of 1.
+
+ But NEVER batch experiments that have unresolved dependencies on each other.
+ Auth must be solved before data endpoints. Reference data (e.g. station lists)
+ should be solved before parameterized endpoints that depend on those IDs.
+
+ dispatch_experiment (singular) should ONLY be used for rare one-off follow-up
+ experiments; batch everything else with dispatch_experiments_batch.
+
+ Batch aggressively:
+ - After plan_routines, immediately batch experiments for all priority-1 routines
+ - When testing multiple API endpoints, batch them all at once
+ - When probing auth + multiple data endpoints, batch everything together
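+
+ Illustrative batch call (argument names here are a sketch only; defer to the
+ actual dispatch_experiments_batch tool schema):
+ ```json
+ {
+   "experiments": [
+     {"routine_spec_id": "<spec-id-1>", "methodology": "Navigate to {url}, then replay the standings request seen in captures..."},
+     {"routine_spec_id": "<spec-id-2>", "methodology": "Navigate to {url}, wait 3 seconds, then dump sessionStorage looking for auth tokens..."}
+   ]
+ }
+ ```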
+
+ ## Routine Naming & Documentation Standards
+
+ These routines will be VECTORIZED and stored in databases for other agents to
+ discover via semantic search. Poor names and vague descriptions make routines
+ invisible and unusable. Follow these rules strictly:
+
+ **Routine name** — snake_case, verb_noun pattern, 3+ segments, MUST include site context:
+ The name must make sense in isolation — another agent reading ONLY the name
+ should know what site/service this targets and what it does. Include a short
+ site identifier as a prefix or qualifier.
+
+ GOOD: get_premierleague_standings, search_premierleague_matches_by_season,
+ fetch_amtrak_train_schedules, download_arxiv_paper_pdf,
+ list_espn_upcoming_fixtures, get_github_repo_stars
+ BAD: get_standings (standings from where?), get_content_item (what content?
+ what site?), fetch_data (completely generic), search_matches (which sport?
+ which site?), get_league_standings (which league? which site?)
+
+ **Routine description** — ≥8 words, must explain:
+ 1. What it does (the action)
+ 2. What inputs it accepts (parameters)
+ 3. What data it returns (the output)
+ GOOD: "Fetches Premier League standings for a given competition ID and
+ season ID, returning team names, positions, wins, draws, losses,
+ goals scored, goals conceded, and total points."
+ BAD: "Get standings" (too short, no input/output info)
+ BAD: "A routine for the Premier League" (doesn't say what it does or returns)
+
+ **Parameter names** — snake_case, descriptive:
+ GOOD: competition_id, season_year, team_name, departure_date
+ BAD: id (ambiguous), param1 (meaningless), x (cryptic)
+
+ **Parameter descriptions** — ≥3 words, explain what the value represents:
+ GOOD: "The unique competition identifier (e.g. 1 for Premier League)"
+ GOOD: "Season year in YYYY format (e.g. 2024)"
+ BAD: "ID" (too terse)
+ BAD: "The season" (doesn't explain format or expected values)
+
+ **Non-obvious parameter sourcing** — CRITICAL for opaque IDs and codes:
+ If a parameter is NOT something a human would naturally know (e.g. an internal
+ numeric ID, a slug, an encoded token, a UUID), the description MUST explain
+ WHERE to get that value. The user calling this routine has no idea what
+ "competition_id: 1" means unless you tell them how to find it.
+
+ GOOD: "Internal competition ID. Obtain from the get_competitions routine or
+ the /competitions API endpoint. Example: 1 = Premier League, 2 = Championship."
+ GOOD: "Season ID as used by the Premier League API. Use the get_seasons routine
+ to list valid season IDs for a competition. Example: 418 = 2023-24 season."
+ GOOD: "Team slug as it appears in the site URL path (e.g. 'arsenal', 'manchester-united').
+ Find by calling get_teams or navigating to the team page."
+ BAD: "The competition ID" (where do I get it?)
+ BAD: "Season identifier" (what values are valid? how do I look them up?)
+
+ Rule of thumb: if you can't google the value, the description must say how to get it.
+
+ ## CRITICAL RULES
+
+ - NEVER guess at request details. Always dispatch experiments to verify.
+ - Write experiment methodologies that reference worker tools by name.
+ - Record a verdict for EVERY completed experiment via record_finding.
+ - Always include reusable takeaways in record_finding so future workers
+ receive concrete lessons (claim + how_to_apply_next + evidence).
+ - If an experiment is ambiguous, dispatch a targeted follow-up experiment with more specific methodology.
+ - ALWAYS provide test_parameters when calling submit_routine — the routine
+ WILL be executed and inspected. Use realistic values the experiments proved work.
+ If the routine has 0 parameters, pass test_parameters: {}
+ - DEPENDENCY ORDER IS SACRED: auth → reference data → data endpoints → assembly.
+ NEVER dispatch data endpoint experiments until auth is CONFIRMED working.
+ NEVER give up on data endpoints just because they returned 401 — that means
+ you need to solve auth first, not that the endpoint is broken.
+ - Workers do NOT share browser state. When an endpoint requires auth, your
+ experiment methodology must include FULL auth instructions (token URL, headers,
+ subscription key) so the worker can authenticate within its own session.
+ - mark_routine_failed is globally gated: you cannot fail any routine until at
+ least 5 routine attempts have failed across the pipeline. Keep iterating and
+ submitting improved routines before giving up on individual specs.
+
+ ## Resilience — NEVER Give Up Early
+
+ - NEVER call mark_failed after fewer than 5 experiments per routine.
+ CORS failures, 400 errors, and network issues are NORMAL obstacles, not
+ reasons to quit. They mean you need a different approach, not that the
+ pipeline is hopeless.
+ - When a fetch fails (CORS, 400, timeout), iterate with alternative approaches:
+ 1. Use search_recorded_transactions / get_recorded_transaction to see the EXACT
+ request headers and patterns that worked in the recorded session, then
+ replicate them in the worker's experiment.
+ 2. Use browser_cdp_command with Fetch.enable to intercept requests at the CDP
+ level — this bypasses CORS entirely since it operates below the browser
+ security layer.
+ 3. Try navigating directly to the API URL with browser_navigate — GET requests
+ via top-level navigation don't have CORS restrictions.
+ 4. Try fetch with mode: 'no-cors' (note: the response body will be opaque and
+ unreadable) or fetch from a different origin context.
+ 5. Check if the site's JS uses a proxy path (e.g. /api/* proxied to the API
+ domain) — search the captured network data for path patterns.
+ - If ALL alternative approaches fail for a routine, mark_routine_failed for THAT
+ routine and move on to the next one. Do NOT call mark_failed (pipeline-level)
+ unless every single routine has been individually addressed.
+ - When results are unclear, dispatch a focused follow-up experiment rather than
+ guessing — experiments are cheap compared to shipping a broken routine.
+
+ ## Common Execution Failures — MUST READ
+
+ ### TypeError: Failed to fetch (CORS)
+ If a routine's fetch operation fails with "TypeError: Failed to fetch", this
+ almost always means the API's CORS policy (Access-Control-Allow-Origin) does
+ not permit the browser's current origin. Routines start from about:blank
+ (origin "null"), so ANY cross-origin fetch will fail without navigating first.
+
+ The fix is to add a `navigate` operation BEFORE the first fetch to set the
+ browser origin to the allowed domain.
+
+ Example: If the API is at https://api.example.com but CORS only allows
+ https://www.example.com, the routine MUST start with:
+ {"type": "navigate", "url": "https://www.example.com"}
+ before any fetch to https://api.example.com/...
+
+ RULE: Every routine that calls an external API MUST start with a navigate
+ operation. This is cheap (one page load) and prevents CORS issues. If you
+ see "Failed to fetch" in an inspection blocking issue, ADD A NAVIGATE OP.
+
+ For more details: search_files(scope="docs", query="cors-failed-to-fetch", mode="exact")
+
+ ### HTTP 401/403 (Authentication)
+ If a fetch returns 401/403, the routine is missing authentication. Check
+ experiment findings for auth token endpoints and subscription keys. The
+ routine must obtain a token (via fetch + js_evaluate) before calling
+ protected endpoints. For more details: search_files(scope="docs", query="unauthenticated", mode="exact")
+ """)
+
+ # -----------------------------------------------------------------------
+ # Constructor
+ # -----------------------------------------------------------------------
+
+ def __init__(
+ self,
+ emit_message_callable: Callable[[EmittedMessage], None],
+ task: str,
+ # Exploration summaries — injected into system prompt
+ exploration_summaries: dict[str, str] | None = None,
+ # Data loaders — passed through to workers
+ network_data_loader: NetworkDataLoader | None = None,
+ storage_data_loader: StorageDataLoader | None = None,
+ dom_data_loader: DOMDataLoader | None = None,
+ window_property_data_loader: WindowPropertyDataLoader | None = None,
+ documentation_data_loader: DocumentationDataLoader | None = None,
+ # Browser context — passed through to workers
+ remote_debugging_address: str | None = None,
+ # Resume support — pass an existing ledger to pick up where a previous PI left off
+ ledger: DiscoveryLedger | None = None,
+ # LLM config
+ llm_model: LLMModel = OpenAIModel.GPT_5_1,
+ worker_llm_model: LLMModel | None = None,
+ max_iterations: int = 200,
+ worker_max_loops: int = 30,
+ max_attempts_per_routine: int = 5,
+ min_experiments_before_fail: int = 10,
+ min_global_failed_attempts_before_routine_failure: int = 5,
+ # Agent pool sizes
+ num_workers: int = 3,
+ num_inspectors: int = 1,
+ # Persistence callbacks
+ on_ledger_change: Callable[[DiscoveryLedger, str], None] | None = None,
+ on_agent_thread: Callable[[str, str, list[dict[str, Any]]], None] | None = None,
+ on_attempt_record: Callable[[dict[str, Any]], None] | None = None,
+ # Standard agent args
+ persist_chat_callable: Callable[[Chat], Chat] | None = None,
+ persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None,
+ stream_chunk_callable: Callable[[str], None] | None = None,
+ chat_thread: ChatThread | None = None,
+ existing_chats: list[Chat] | None = None,
+ workspace: AgentWorkspace | None = None,
+ worker_workspace_factory: Callable[[], AgentWorkspace] | None = None,
+ inspector_workspace_factory: Callable[[], AgentWorkspace] | None = None,
+ ) -> None:
+ # Task
+ self._task = task
+ self._max_iterations = max_iterations
+ self._worker_max_loops = worker_max_loops
+ self._max_attempts_per_routine = max_attempts_per_routine
+ self._min_experiments_before_fail = min_experiments_before_fail
+ self._min_global_failed_attempts_before_routine_failure = max(
+ 0,
+ int(min_global_failed_attempts_before_routine_failure),
+ )
+
+ # Exploration context
+ raw_summaries = exploration_summaries or {}
+ self._exploration_summaries_raw = dict(raw_summaries)
+ toonified_summaries: dict[str, str] = {}
+ for domain, summary in raw_summaries.items():
+ if not isinstance(summary, str):
+ toonified_summaries[domain] = toon_encode(summary)
+ continue
+
+ stripped = summary.strip()
+ if not stripped:
+ toonified_summaries[domain] = summary
+ continue
+
+ try:
+ parsed = json.loads(stripped)
+ except json.JSONDecodeError:
+ toonified_summaries[domain] = summary
+ continue
+
+ toonified_summaries[domain] = toon_encode(parsed)
+ self._exploration_summaries = toonified_summaries
+
+ # Data loaders (passed through to workers)
+ self._network_data_loader = network_data_loader
+ self._storage_data_loader = storage_data_loader
+ self._dom_data_loader = dom_data_loader
+ self._window_property_data_loader = window_property_data_loader
+ self._documentation_data_loader = documentation_data_loader
+
+ # Browser context (passed through to workers)
+ self._remote_debugging_address = remote_debugging_address
+
+ # LLM
+ self._worker_llm_model = worker_llm_model or llm_model
+
+ # Agent pools
+ self._num_workers = num_workers
+ self._num_inspectors = num_inspectors
+ self._worker_counter = 0 # Round-robin counter for workers
+ self._inspector_counter = 0 # Round-robin counter for inspectors
+
+ # Persistence callbacks
+ self._on_ledger_change = on_ledger_change
+ self._on_agent_thread = on_agent_thread
+ self._on_attempt_record = on_attempt_record
+ self._worker_workspace_factory = worker_workspace_factory
+ self._inspector_workspace_factory = inspector_workspace_factory
+
+ # Internal state — the Discovery Ledger tracks everything
+ # Accept an existing ledger for resume after context exhaustion
+ self._ledger = ledger or DiscoveryLedger(user_task=task)
+ self._orchestration_state = AgentOrchestrationState()
+ self._agent_instances: dict[str, AbstractAgent] = {}
+ self._is_done = False
+ self._pipeline_result: RoutineCatalog | None = None
+ self._recent_tool_calls: list[str] = [] # Track recent tool names for loop detection
+ self._docs_reviewed: bool = False # Gate: must review docs before dispatching experiments
+
+ super().__init__(
+ emit_message_callable=emit_message_callable,
+ workspace=workspace,
+ persist_chat_callable=persist_chat_callable,
+ persist_chat_thread_callable=persist_chat_thread_callable,
+ stream_chunk_callable=stream_chunk_callable,
+ llm_model=llm_model,
+ chat_thread=chat_thread,
+ existing_chats=existing_chats,
+ documentation_data_loader=documentation_data_loader,
+ allow_code_execution=True,
+ )
+
+ logger.debug(
+ "PrincipalInvestigator initialized: task=%s, explorations=%s",
+ task[:80],
+ list(self._exploration_summaries.keys()),
+ )
+
+ # -----------------------------------------------------------------------
+ # System prompt
+ # -----------------------------------------------------------------------
+
+ def _get_system_prompt(self) -> str:
+ parts: list[str] = [self.SYSTEM_PROMPT_CORE]
+ if self.has_workspace:
+ try:
+ summary = self._require_workspace().generate_summary()
+ parts.append(f"\n\n## Workspace Summary\n{summary}")
+ except Exception as e:
+ logger.warning("Failed to generate PI workspace summary: %s", e)
+
+ # Routine JSON schema — auto-generated from the Pydantic models
+ parts.append("\n## Routine JSON Schema\n\n")
+ parts.append("When calling submit_routine, the routine_json MUST conform to this schema.\n")
+ parts.append("Every operation needs a 'type' field as discriminator.\n\n")
+ parts.append(Routine.model_schema_markdown())
+ parts.append("\n\n### Example routine_json (fetch a parameterized API)\n\n")
+ parts.append(dedent("""\
+ ```json
+ {
+ "name": "get_premierleague_standings",
+ "description": "Fetches Premier League standings for a given competition and season, returning team names, positions, wins, draws, losses, goals scored, goals conceded, and total points.",
+ "parameters": [
+ {"name": "competition_id", "type": "integer", "description": "Internal competition identifier. Obtain from the get_premierleague_competitions routine. Example: 1 = Premier League."},
+ {"name": "season_id", "type": "integer", "description": "Season identifier as used by the Premier League API. Obtain from the get_premierleague_seasons routine. Example: 418 = 2023-24 season."}
+ ],
+ "operations": [
+ {
+ "type": "navigate",
+ "url": "https://www.example.com"
+ },
+ {
+ "type": "fetch",
+ "endpoint": {
+ "url": "https://api.example.com/competitions/{{competitionId}}/seasons/{{seasonId}}/standings",
+ "method": "GET"
+ },
+ "session_storage_key": "standings_result"
+ },
+ {
+ "type": "return",
+ "session_storage_key": "standings_result"
+ }
+ ]
+ }
+ ```
+ """))
+
+ # Worker capabilities
+ parts.append(WORKER_CAPABILITIES)
+ parts.append(self._generate_code_execution_prompt())
+
+ # Exploration summaries
+ if self._exploration_summaries:
+ parts.append("\n## Exploration Summaries\n")
+ for domain, summary in self._exploration_summaries.items():
+ parts.append(f"### {domain}\n{summary}\n")
+
+ # Discovery Ledger
+ if self._ledger.routine_specs or self._ledger.experiments or self._ledger.attempts:
+ ledger_payload = self._ledger.model_dump(
+ mode="json",
+ exclude={
+ "experiments": {
+ "__all__": {
+ "prompt": True,
+ "output": True,
+ }
+ },
+ "attempts": {
+ "__all__": {
+ "routine_json": True,
+ "execution_result": True,
+ "inspection_result": True,
+ }
+ },
+ },
+ )
+ parts.append("\n## Discovery Ledger\n")
+ parts.append(toon_encode(ledger_payload))
+
+ # Task queue status
+ queue = self._orchestration_state.get_queue_status()
+ if any(v > 0 for v in queue.values()):
+ parts.append("\n## Task Queue\n")
+ parts.append(toon_encode(queue))
+
+ return "".join(parts)
+
+ # -----------------------------------------------------------------------
+ # Public entry point
+ # -----------------------------------------------------------------------
+
+ def run(self) -> RoutineCatalog | None:
+ """
+ Run the PI loop to completion.
+
+ Returns:
+ A RoutineCatalog of shipped routines, or None if construction failed.
+ """
+ # Seed the conversation — detect resume vs fresh start
+ is_resume = bool(self._ledger.experiments or self._ledger.routine_specs)
+
+ if is_resume:
+ initial_message = (
+ f"TASK: {self._task}\n\n"
+ "You are RESUMING a previous session that ran out of context. "
+ "Your Discovery Ledger has been preserved with all prior work.\n\n"
+ "FIRST: Call get_ledger to see exactly where things stand — "
+ "what routines are planned, what experiments have been run, "
+ "what's shipped, and what still needs work.\n\n"
+ "Then pick up where the previous session left off. Do NOT repeat "
+ "experiments that already have verdicts."
+ )
+ logger.info(
+ "PI resuming: %d specs, %d experiments, %d attempts",
+ len(self._ledger.routine_specs),
+ len(self._ledger.experiments),
+ len(self._ledger.attempts),
+ )
+ else:
+ initial_message = (
+ f"TASK: {self._task}\n\n"
+ "MANDATORY FIRST STEP: Review the routine documentation before doing anything else.\n"
+ "Call search_files(scope='docs', query='operation', mode='exact') and read_file(scope='docs', path='...') on the Routine/operation model files\n"
+ "to understand ALL operation types (fetch, click, input_text, js_evaluate, download, etc.).\n"
+ "You CANNOT dispatch experiments until you understand the full routine capabilities.\n\n"
+ "After reviewing docs:\n"
+ "1. Analyze the exploration summaries\n"
+ "2. Call plan_routines to declare what routines to build\n"
+ "3. Use dispatch_experiments_batch to test ALL priority-1 routines IN PARALLEL\n"
+ "4. Record findings for each, then batch the next round\n"
+ "5. Build and submit_routine for each proven routine\n"
+ "6. Call mark_complete when all routines are shipped or failed\n\n"
+ "IMPORTANT: Always use dispatch_experiments_batch (not dispatch_experiment) "
+ "to run multiple experiments in parallel. This is much faster."
+ )
+ self._add_chat(ChatRole.USER, initial_message)
+
+ for iteration in range(self._max_iterations):
+ if self._is_done:
+ logger.info("PI completed after %d iterations", iteration)
+ self._dump_agent_thread("principal_investigator", self)
+ return self._pipeline_result
+
+ messages = self._build_messages_for_llm()
+ response = self._call_llm(
+ messages,
+ self._get_system_prompt(),
+ tool_choice="required",
+ )
+
+ # Add assistant response to chat
+ self._add_chat(
+ ChatRole.ASSISTANT,
+ content=response.content or "",
+ tool_calls=response.tool_calls,
+ llm_provider_response_id=response.response_id,
+ )
+
+ # Persist PI thread after every iteration
+ self._dump_agent_thread("principal_investigator", self)
+
+ if response.tool_calls:
+ self._process_tool_calls(response.tool_calls)
+
+ # Track docs review — any docs tool call satisfies the gate
+ _DOCS_TOOLS = {"search_files", "read_file", "list_files"}
+ for tc in response.tool_calls:
+ if tc.tool_name in _DOCS_TOOLS:
+ scope = (tc.tool_arguments or {}).get("scope")
+ if scope == "docs":
+ self._docs_reviewed = True
+
+ # Loop detection: track the names of recent tool calls
+ for tc in response.tool_calls:
+ self._recent_tool_calls.append(tc.tool_name)
+ # Keep only last 6
+ self._recent_tool_calls = self._recent_tool_calls[-6:]
+
+ # Detect stuck patterns:
+ # 1. Same tool called 3+ times in a row (any tool)
+ # 2. Alternating pair (e.g. mark_routine_failed + get_ledger)
+ _is_stuck = False
+ _stuck_msg = ""
+
+ if len(self._recent_tool_calls) >= 3:
+ last3 = self._recent_tool_calls[-3:]
+ # Same tool 3x in a row
+ if len(set(last3)) == 1:
+ _is_stuck = True
+ _stuck_msg = f"calling {last3[0]} repeatedly"
+
+ if not _is_stuck and len(self._recent_tool_calls) >= 4:
+ last4 = self._recent_tool_calls[-4:]
+ # Alternating pair: A B A B
+ if last4[0] == last4[2] and last4[1] == last4[3] and last4[0] != last4[1]:
+ _is_stuck = True
+ _stuck_msg = f"alternating between {last4[0]} and {last4[1]}"
+
+ if _is_stuck:
+ self._recent_tool_calls.clear()
+ shipped = sum(1 for s in self._ledger.routine_specs if s.status == RoutineSpecStatus.SHIPPED)
+ failed = sum(1 for s in self._ledger.routine_specs if s.status == RoutineSpecStatus.FAILED)
+ unaddressed = [
+ s for s in self._ledger.routine_specs
+ if s.status not in (RoutineSpecStatus.SHIPPED, RoutineSpecStatus.FAILED)
+ ]
+ unaddressed_names = [f"{s.name} ({s.status.value})" for s in unaddressed]
+ self._add_chat(
+ ChatRole.USER,
+ f"STOP — you are stuck {_stuck_msg}. This is wasting iterations.\n\n"
+ f"Current progress: {shipped} shipped, {failed} failed, "
+ f"{len(unaddressed)} unaddressed.\n"
+ f"Unaddressed routines: {', '.join(unaddressed_names) or 'none'}\n\n"
+ "If a tool keeps returning an error, do NOT retry the same call. "
+ "Read the error message and change your approach.\n\n"
+ "To make progress you MUST do ONE of these:\n"
+ "1. dispatch_experiments_batch for unaddressed routines\n"
+ "2. submit_routine with VALID routine_json AND test_parameters "
+ "(if routine has 0 params, pass test_parameters: {})\n"
+ "3. mark_routine_failed for routines that truly can't work\n"
+ "4. mark_complete if all routines are shipped or failed\n\n"
+ "Pick ONE action for a DIFFERENT routine than the one you're stuck on.",
+ )
+ else:
+ # Nudge the PI to act
+ self._add_chat(
+ ChatRole.USER,
+ "You must use a tool. Dispatch an experiment, record a finding, "
+ "submit a routine, or mark_complete if done.",
+ )
+
+ logger.warning("PI exhausted %d iterations without completing", self._max_iterations)
+ # Dump PI thread before returning partial results
+ self._dump_agent_thread("principal_investigator", self)
+ # Return whatever we've shipped so far
+ return self._build_partial_catalog()
+
+ # ===================================================================
+ # Persistence — notify external listener after ledger mutations
+ # ===================================================================
+
+ def _persist(self, reason: str) -> None:
+ """Fire the on_ledger_change callback if registered."""
+ if self._on_ledger_change is not None:
+ try:
+ self._on_ledger_change(self._ledger, reason)
+ except Exception as e:
+ logger.warning("on_ledger_change callback failed: %s", e)
+
+ def _record_attempt(
+ self,
+ spec: RoutineSpec,
+ attempt: RoutineAttempt,
+ routine_json: dict[str, Any],
+ test_parameters: dict[str, Any],
+ execution_result: RoutineExecutionResultWithMetadata | None,
+ inspection_result: dict[str, Any] | None,
+ ) -> None:
+ """Persist a unified attempt record via the on_attempt_record callback."""
+ if self._on_attempt_record is None:
+ return
+ try:
+ # Count which attempt number this is for the spec
+ spec_attempts = self._ledger.get_attempts_for_spec(spec.id)
+ attempt_number = len(spec_attempts)
+
+ record: dict[str, Any] = {
+ "attempt_id": attempt.id,
+ "spec_id": spec.id,
+ "spec_name": spec.name,
+ "spec_description": spec.description,
+ "attempt_number": attempt_number,
+ "timestamp": str(datetime.now()),
+ "verdict": attempt.status.value,
+ "routine_json": routine_json,
+ "test_parameters": test_parameters,
+ "execution_result": (
+ execution_result.model_dump() if execution_result is not None else None
+ ),
+ "inspection_result": inspection_result,
+ }
+ self._on_attempt_record(record)
+ except Exception as e:
+ logger.warning("on_attempt_record callback failed: %s", e)
+
+ def _dump_agent_thread(self, agent_label: str, agent: AbstractAgent) -> None:
+ """Dump an agent's full message history via the on_agent_thread callback."""
+ if self._on_agent_thread is None:
+ return
+ try:
+ chats = agent.get_chats()
+ messages = []
+ for c in chats:
+ try:
+ msg = {
+ "id": c.id,
+ "role": c.role.value if c.role else "unknown",
+ "content": c.content or "",
+ "tool_calls": [
+ {"tool_name": tc.tool_name, "arguments": tc.tool_arguments, "call_id": tc.call_id}
+ for tc in (c.tool_calls or [])
+ ],
+ "tool_call_id": c.tool_call_id,
+ "created_at": str(c.created_at) if c.created_at else None,
+ }
+ messages.append(msg)
+ except Exception as chat_err:
+ logger.warning("Failed to serialize chat %s for %s: %s", c.id, agent_label, chat_err)
+ messages.append({"id": c.id, "role": "unknown", "content": f"[serialization error: {chat_err}]"})
+ thread_id = agent.get_thread().id if agent.get_thread() else "unknown"
+ self._on_agent_thread(agent_label, thread_id, messages)
+ except Exception as e:
+ logger.warning("on_agent_thread callback failed for %s: %s", agent_label, e, exc_info=True)
+
+ # ===================================================================
+ # CATALOG PLANNING TOOLS
+ # ===================================================================
+
+ @agent_tool()
+ def _plan_routines(
+ self,
+ specs: list[dict[str, Any]],
+ ) -> dict[str, Any]:
+ """
+ Declare what routines to build from the exploration data.
+
+ Call this early after analyzing exploration summaries. Each spec
+ represents a distinct capability to extract from the site.
+ Can be called again to add new specs discovered during experimentation.
+
+ Args:
+ specs: List of routine specs. Each dict has:
+ - name: Short name (e.g. "get_league_standings")
+ - description: What the routine does
+ - priority: 1=must-have, 2=should-have, 3=nice-to-have (default 1)
+ """
+ created_ids: list[str] = []
+ for spec_dict in specs:
+ spec = RoutineSpec(
+ name=spec_dict.get("name") or spec_dict.get("id", "unnamed"),
+ description=spec_dict.get("description", ""),
+ priority=spec_dict.get("priority", 1),
+ )
+ self._ledger.add_spec(spec)
+ created_ids.append(spec.id)
+
+ # Auto-set the first one as active if none is active
+ if self._ledger.active_spec_id is None and self._ledger.routine_specs:
+ self._ledger.active_spec_id = self._ledger.routine_specs[0].id
+
+ self._persist("plan_routines")
+ return {
+ "created": len(created_ids),
+ "spec_ids": created_ids,
+ "total_specs": len(self._ledger.routine_specs),
+ }
+
+ @agent_tool()
+ def _set_active_routine(self, spec_id: str) -> dict[str, Any]:
+ """
+ Switch focus to a different routine. The PI works on one routine at a
+ time but can switch when blocked or when dependencies are shared.
+
+ Args:
+ spec_id: ID of the RoutineSpec to focus on.
+ """
+ spec = self._ledger.get_spec(spec_id)
+ if spec is None:
+ return {"error": f"No spec found with ID: {spec_id}"}
+
+ self._ledger.active_spec_id = spec_id
+ return {"active": spec.name, "status": spec.status.value}
+
+ # ===================================================================
+ # EXPERIMENT TOOLS
+ # ===================================================================
+
+ def _truncate_text_for_briefing(self, value: Any, max_chars: int = 220) -> str:
+ """Normalize arbitrary values to a compact, single-line string."""
+ text = value if isinstance(value, str) else json.dumps(value, default=str)
+ text = text.replace("\n", " ").strip()
+ if len(text) > max_chars:
+ return text[:max_chars] + "..."
+ return text
+
+ def _build_worker_briefing(self, routine_spec_id: str | None) -> str:
+ """
+ Build compact reusable context for workers from ledger state.
+
+ Includes proven artifacts, reusable experiment takeaways, and known blockers
+ from latest failed routine attempts.
+ """
+ lines: list[str] = [
+ "## Worker Briefing (Reusable Context)",
+ "Use this as prior context. Verify assumptions against live/browser evidence.",
+ ]
+
+ # Proven artifacts (compact, capped)
+ proven_lines: list[str] = []
+ for fetch in self._ledger.proven.fetches[-5:]:
+ method = self._truncate_text_for_briefing(fetch.get("method", "?"), 24)
+ url = self._truncate_text_for_briefing(fetch.get("url", "?"), 140)
+ proven_lines.append(f"- FETCH: {method} {url}")
+ for nav in self._ledger.proven.navigations[-4:]:
+ url = self._truncate_text_for_briefing(nav.get("url", "?"), 160)
+ proven_lines.append(f"- NAV: {url}")
+ for token in self._ledger.proven.tokens[-4:]:
+ name = self._truncate_text_for_briefing(token.get("name", "?"), 40)
+ source = self._truncate_text_for_briefing(token.get("source", "?"), 120)
+ proven_lines.append(f"- TOKEN: {name} (source: {source})")
+ for param in self._ledger.proven.parameters[-5:]:
+ name = self._truncate_text_for_briefing(param.get("name", "?"), 40)
+ ptype = self._truncate_text_for_briefing(param.get("type", "?"), 24)
+ example = self._truncate_text_for_briefing(param.get("example_value", ""), 90)
+ proven_lines.append(f"- PARAM: {name} ({ptype}) example={example}")
+ if proven_lines:
+ lines.append("\n### Proven Artifacts")
+ lines.extend(proven_lines[:12])
+
+ # Reusable takeaways from confirmed/partial experiments
+ relevant_takeaways: list[tuple[ExperimentEntry, ExperimentTakeaway]] = []
+ for exp in reversed(self._ledger.experiments):
+ if exp.verdict not in {ExperimentVerdict.CONFIRMED, ExperimentVerdict.PARTIAL}:
+ continue
+ if exp.routine_spec_id is not None and routine_spec_id is not None:
+ if exp.routine_spec_id != routine_spec_id:
+ continue
+ elif exp.routine_spec_id is not None and routine_spec_id is None:
+ # Shared experiments should not inherit routine-specific assumptions by default.
+ continue
+
+ for takeaway in exp.takeaways:
+ relevant_takeaways.append((exp, takeaway))
+ if len(relevant_takeaways) >= 8:
+ break
+ if len(relevant_takeaways) >= 8:
+ break
+
+ if relevant_takeaways:
+ lines.append("\n### Prior Experiment Takeaways")
+ for exp, takeaway in relevant_takeaways:
+ claim = self._truncate_text_for_briefing(takeaway.claim, 190)
+ tags = ", ".join(takeaway.tags[:4]) if takeaway.tags else ""
+ prefix = f"[{exp.id}]"
+ if tags:
+ prefix += f" [{tags}]"
+ lines.append(f"- {prefix} {claim}")
+ if takeaway.how_to_apply_next:
+ how = self._truncate_text_for_briefing(takeaway.how_to_apply_next, 180)
+ lines.append(f"- apply: {how}")
+ if takeaway.evidence:
+ ev = self._truncate_text_for_briefing(takeaway.evidence, 180)
+ lines.append(f"- evidence: {ev}")
+
+ # Latest blockers for this routine spec
+ if routine_spec_id:
+ attempts = self._ledger.get_attempts_for_spec(routine_spec_id)
+ latest_failed = next(
+ (attempt for attempt in reversed(attempts) if attempt.status == RoutineAttemptStatus.FAILED),
+ None,
+ )
+ if latest_failed and latest_failed.blocking_issues:
+ lines.append("\n### Known Blockers (Latest Failed Attempt)")
+ for issue in latest_failed.blocking_issues[:4]:
+ lines.append(f"- {self._truncate_text_for_briefing(issue, 220)}")
+
+ if len(lines) <= 2:
+ return ""
+ return "\n".join(lines)
+
+ def _compose_worker_methodology(self, methodology: str, routine_spec_id: str | None) -> str:
+ """Compose final worker task methodology with briefing + assigned experiment."""
+ briefing = self._build_worker_briefing(routine_spec_id)
+ if not briefing:
+ return methodology
+ return f"{briefing}\n\n## Assigned Experiment Methodology\n{methodology}"
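The composition rule is deliberately simple: prepend the briefing when one exists, otherwise pass the methodology through unchanged. A standalone sketch of that rule (a pure function, independent of the ledger):

```python
def compose_methodology(briefing: str, methodology: str) -> str:
    """Prefix reusable briefing context onto an assigned experiment methodology."""
    if not briefing:
        return methodology
    return f"{briefing}\n\n## Assigned Experiment Methodology\n{methodology}"

# Without a briefing the methodology passes through untouched;
# with one, the briefing leads and the methodology follows under its own heading.
print(compose_methodology("", "Navigate to the site"))
composed = compose_methodology("## Worker Briefing", "Navigate to the site")
print(composed.splitlines()[0])
```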
+
+ @agent_tool(token_optimized=True)
+ def _dispatch_experiment(
+ self,
+ hypothesis: str,
+ rationale: str,
+ methodology: str,
+ output_description: str | None = None,
+ ) -> dict[str, Any]:
+ """
+ Create and dispatch an experiment to a worker.
+
+ The worker has browser tools and capture lookup tools. Write the methodology
+ so the worker knows exactly what to do — reference tools by name.
+
+ Args:
+ hypothesis: What we're testing. Specific and falsifiable.
+ rationale: WHY we're testing this — evidence, reasoning, expectations.
+ methodology: Instructions for the worker. Reference worker tools by name.
+ output_description: Description of expected output.
+ """
+ # Gate: must review docs first
+ if not self._docs_reviewed and self._documentation_data_loader is not None:
+ return {
+ "error": (
+ "You must review the routine documentation BEFORE dispatching experiments. "
+ "Call search_files(scope='docs', query='operation', mode='exact') and read_file(scope='docs', path='...') to understand all available "
+ "operation types (fetch, click, input_text, js_evaluate, get_cookies, "
+ "download, etc.). This ensures you design experiments that leverage the "
+ "full routine capabilities."
+ )
+ }
+
+ worker_methodology = self._compose_worker_methodology(
+ methodology=methodology,
+ routine_spec_id=None,
+ )
+ resolved_output_description = output_description or "Structured experiment findings."
+
+ # Create experiment entry
+ experiment = ExperimentEntry(
+ hypothesis=hypothesis,
+ rationale=rationale,
+ methodology=worker_methodology,
+ routine_spec_id=None,
+ status=ExperimentStatus.RUNNING,
+ )
+ self._ledger.add_experiment(experiment)
+
+ # Create and dispatch task
+ task = Task(
+ agent_type=SpecialistAgentType.EXPERIMENT_WORKER,
+ prompt=worker_methodology,
+ max_loops=self._worker_max_loops,
+ output_schema=self.DEFAULT_WORKER_OUTPUT_SCHEMA,
+ output_description=resolved_output_description,
+ )
+ self._orchestration_state.add_task(task)
+ experiment.task_id = task.id
+
+ # Execute immediately
+ result = self._execute_task(task)
+
+ # Update experiment status from task
+ if task.status == TaskStatus.COMPLETED:
+ experiment.status = ExperimentStatus.DONE
+ experiment.output = task.result
+ elif task.status == TaskStatus.FAILED:
+ experiment.status = ExperimentStatus.FAILED
+ experiment.output = {"error": task.error}
+ elif task.status == TaskStatus.PAUSED:
+ experiment.status = ExperimentStatus.RUNNING
+
+ self._persist(f"experiment_{experiment.id}")
+ return {
+ "experiment_id": experiment.id,
+ "task_id": task.id,
+ "status": experiment.status.value,
+ "result": result,
+ }
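The task-to-experiment status mapping at the end of dispatch can be isolated as a small pure function. The enums here are stand-ins for the real `TaskStatus`/`ExperimentStatus` (only the members used by the mapping are reproduced):

```python
from enum import Enum

class TaskStatus(Enum):        # stand-in for the real enum
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"

class ExperimentStatus(Enum):  # stand-in for the real enum
    RUNNING = "running"
    DONE = "done"
    FAILED = "failed"

def experiment_status_for(task_status: TaskStatus) -> ExperimentStatus:
    """Completed tasks finish the experiment; paused tasks keep it running."""
    if task_status is TaskStatus.COMPLETED:
        return ExperimentStatus.DONE
    if task_status is TaskStatus.FAILED:
        return ExperimentStatus.FAILED
    return ExperimentStatus.RUNNING

print(experiment_status_for(TaskStatus.PAUSED).value)  # running
```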
+
+ @agent_tool(token_optimized=True)
+ def _dispatch_experiments_batch(
+ self,
+ experiments: list[dict[str, Any]],
+ ) -> dict[str, Any]:
+ """
+ Dispatch multiple experiments IN PARALLEL to separate workers.
+
+ Each experiment runs on its own worker with its own browser tab,
+ all executing concurrently. Use this when you have independent
+ experiments that don't depend on each other's results.
+
+ Much faster than calling dispatch_experiment sequentially — N experiments
+ run in roughly the time of 1.
+
+ Args:
+ experiments: List of experiment dicts, each with:
+ - hypothesis: What we're testing (specific and falsifiable)
+ - rationale: WHY we're testing this
+ - methodology: Instructions for the worker (reference tools by name!)
+ - output_description: (optional) Description of expected output
+ """
+ if not experiments:
+ return {"error": "No experiments provided"}
+
+ # Gate: must review docs first
+ if not self._docs_reviewed and self._documentation_data_loader is not None:
+ return {
+ "error": (
+ "You must review the routine documentation BEFORE dispatching experiments. "
+ "Call search_files(scope='docs', query='operation', mode='exact') and read_file(scope='docs', path='...') to understand all available "
+ "operation types (fetch, click, input_text, js_evaluate, get_cookies, "
+ "download, etc.). This ensures you design experiments that leverage the "
+ "full routine capabilities."
+ )
+ }
+
+ # Cap at num_workers to avoid overwhelming the system
+ max_parallel = self._num_workers
+ if len(experiments) > max_parallel:
+ logger.warning(
+ "Batch of %d experiments exceeds worker pool (%d), running first %d",
+ len(experiments), max_parallel, max_parallel,
+ )
+ experiments = experiments[:max_parallel]
+
+ # Phase 1: Create all experiment entries and tasks (sequential — fast, no I/O)
+ task_experiment_pairs: list[tuple[Task, ExperimentEntry]] = []
+ for exp_dict in experiments:
+ worker_methodology = self._compose_worker_methodology(
+ methodology=exp_dict.get("methodology", ""),
+ routine_spec_id=None,
+ )
+ resolved_output_description = (
+ exp_dict.get("output_description") or "Structured experiment findings."
+ )
+
+ experiment = ExperimentEntry(
+ hypothesis=exp_dict.get("hypothesis", ""),
+ rationale=exp_dict.get("rationale", ""),
+ methodology=worker_methodology,
+ routine_spec_id=None,
+ status=ExperimentStatus.RUNNING,
+ )
+ self._ledger.add_experiment(experiment)
+
+ task = Task(
+ agent_type=SpecialistAgentType.EXPERIMENT_WORKER,
+ prompt=worker_methodology,
+ max_loops=self._worker_max_loops,
+ output_schema=self.DEFAULT_WORKER_OUTPUT_SCHEMA,
+ output_description=resolved_output_description,
+ )
+ self._orchestration_state.add_task(task)
+ experiment.task_id = task.id
+ task_experiment_pairs.append((task, experiment))
+
+ self._persist("batch_dispatched")
+
+ # Phase 2: Execute all tasks in parallel using ThreadPoolExecutor
+ results: list[dict[str, Any]] = []
+
+ def _run_one(pair: tuple[Task, ExperimentEntry]) -> dict[str, Any]:
+ task, experiment = pair
+ # Create a dedicated worker for this parallel task
+ worker = self._create_worker()
+ subagent = SubAgent(
+ type=task.agent_type,
+ llm_model=self._worker_llm_model.value,
+ )
+ self._orchestration_state.subagents[subagent.id] = subagent
+ self._agent_instances[subagent.id] = worker
+ task.agent_id = subagent.id
+ subagent.task_ids.append(task.id)
+
+ # Wire up real-time thread persistence
+ agent_label = f"worker_{subagent.id}"
+ worker._on_chat_added = lambda _chat: self._dump_agent_thread(agent_label, worker)
+
+ try:
+ task.status = TaskStatus.IN_PROGRESS
+ task.started_at = datetime.now()
+
+ config = AutonomousRunConfig(
+ min_iterations=1,
+ max_iterations=task.max_loops,
+ )
+ result = worker.run_autonomous(
+ task=task.prompt,
+ config=config,
+ output_schema=task.output_schema,
+ output_description=task.output_description,
+ )
+ task.loops_used += worker.autonomous_iteration
+ self._dump_agent_thread(f"worker_{subagent.id}", worker)
+
+ if result is not None:
+ task.status = TaskStatus.COMPLETED
+ task.completed_at = datetime.now()
+ task.result = result.model_dump() if isinstance(result, BaseModel) else result
+ experiment.status = ExperimentStatus.DONE
+ experiment.output = task.result
+ else:
+ task.status = TaskStatus.FAILED
+ task.error = "Max loops reached without result"
+ experiment.status = ExperimentStatus.FAILED
+ experiment.output = {"error": task.error}
+ except Exception as e:
+ task.status = TaskStatus.FAILED
+ task.error = str(e)
+ task.completed_at = datetime.now()
+ experiment.status = ExperimentStatus.FAILED
+ experiment.output = {"error": str(e)}
+ logger.error("Parallel task %s failed: %s", task.id, e)
+ finally:
+ worker.close()
+
+ return {
+ "experiment_id": experiment.id,
+ "hypothesis": experiment.hypothesis[:100],
+ "status": experiment.status.value,
+ "result_preview": str(experiment.output)[:300] if experiment.output else None,
+ }
+
+        with ThreadPoolExecutor(max_workers=max_parallel) as pool:
+            futures = {
+                pool.submit(_run_one, pair): pair
+                for pair in task_experiment_pairs
+            }
+            handled = set()
+            try:
+                for future in as_completed(futures, timeout=self.WORKER_TIMEOUT_SECONDS + 30):
+                    handled.add(future)
+                    pair = futures[future]
+                    try:
+                        results.append(future.result(timeout=self.WORKER_TIMEOUT_SECONDS))
+                    except FuturesTimeoutError:
+                        _, experiment = pair
+                        logger.error(
+                            "Batch experiment %s timed out after %ds",
+                            experiment.id, self.WORKER_TIMEOUT_SECONDS,
+                        )
+                        experiment.status = ExperimentStatus.FAILED
+                        experiment.output = {"error": f"Worker timed out after {self.WORKER_TIMEOUT_SECONDS}s"}
+                        results.append({
+                            "experiment_id": experiment.id,
+                            "status": "failed",
+                            "error": f"Worker timed out after {self.WORKER_TIMEOUT_SECONDS}s",
+                        })
+                    except Exception as e:
+                        logger.error("Batch experiment failed: %s", e)
+                        results.append({
+                            "experiment_id": pair[1].id,
+                            "status": "failed",
+                            "error": str(e),
+                        })
+            except FuturesTimeoutError:
+                # as_completed() itself hit the overall deadline. Mark every
+                # experiment whose future was never collected as failed so the
+                # batch still returns a complete report instead of raising
+                # out of the tool and skipping _persist().
+                for future, (_, experiment) in futures.items():
+                    if future in handled:
+                        continue
+                    future.cancel()
+                    experiment.status = ExperimentStatus.FAILED
+                    experiment.output = {"error": "Batch deadline exceeded"}
+                    results.append({
+                        "experiment_id": experiment.id,
+                        "status": "failed",
+                        "error": "Batch deadline exceeded",
+                    })
+
+ self._persist("batch_completed")
+
+ completed = sum(1 for r in results if r.get("status") == "done")
+ failed = sum(1 for r in results if r.get("status") == "failed")
+
+ return {
+ "total": len(results),
+ "completed": completed,
+ "failed": failed,
+ "experiments": results,
+ }
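Stripped of ledger bookkeeping, the batch path is a standard fan-out: submit one callable per experiment, then collect with `as_completed` so slow workers never block fast ones, and catch per-future failures so one crash cannot sink the batch. A self-contained sketch (the worker function and timeouts are placeholders):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_one(name: str) -> dict:
    # Placeholder for real worker execution.
    return {"experiment": name, "status": "done"}

names = ["exp_a", "exp_b", "exp_c"]
results = []
with ThreadPoolExecutor(max_workers=len(names)) as pool:
    futures = {pool.submit(run_one, n): n for n in names}
    # Results arrive in completion order, not submission order.
    for future in as_completed(futures, timeout=30):
        try:
            results.append(future.result(timeout=10))
        except Exception as e:  # one failure must not sink the batch
            results.append({"experiment": futures[future], "status": "failed", "error": str(e)})

print(sorted(r["experiment"] for r in results))  # ['exp_a', 'exp_b', 'exp_c']
```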
+
+ def _get_remediation_docs_for_experiment(
+ self, experiment: ExperimentEntry,
+ ) -> str | None:
+ """Scan experiment output for known error patterns and return relevant doc content."""
+ if self._documentation_data_loader is None:
+ return None
+
+ # Build searchable text from output + summary
+ output_text = json.dumps(experiment.output, default=str).lower() if experiment.output else ""
+ summary_text = (experiment.summary or "").lower()
+ searchable = output_text + " " + summary_text
+
+ matched_paths: list[str] = []
+ for keywords, doc_path in self._ERROR_DOC_PATTERNS:
+ if any(kw in searchable for kw in keywords):
+ matched_paths.append(doc_path)
+
+ if not matched_paths:
+ return None
+
+ doc_sections: list[str] = []
+ for doc_path in matched_paths:
+ content = self._documentation_data_loader.get_file_content(doc_path)
+ if content:
+ doc_sections.append(f"--- {doc_path} ---\n{content}")
+
+ if not doc_sections:
+ return None
+
+ return (
+ "IMPORTANT: The following documentation covers known fixes for the "
+ "errors observed in this experiment. Read carefully before deciding "
+ "to give up on this routine spec.\n\n" + "\n\n".join(doc_sections)
+ )
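Remediation lookup is plain keyword matching over the serialized experiment output: each `(keywords, doc_path)` pattern fires if any keyword appears in the lowercased text. A self-contained sketch with hypothetical patterns (the real ones live in `_ERROR_DOC_PATTERNS` on the class):

```python
import json

# Hypothetical patterns for illustration only.
ERROR_DOC_PATTERNS = [
    (("failed to fetch", "cors"), "docs/cors-failed-to-fetch.md"),
    (("401", "unauthorized"), "docs/unauthenticated.md"),
]

def matching_docs(output: dict, summary: str) -> list[str]:
    """Return doc paths whose error keywords appear in the output or summary."""
    searchable = json.dumps(output, default=str).lower() + " " + summary.lower()
    return [
        path for keywords, path in ERROR_DOC_PATTERNS
        if any(kw in searchable for kw in keywords)
    ]

print(matching_docs({"error": "TypeError: Failed to fetch"}, ""))
# ['docs/cors-failed-to-fetch.md']
```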
+
+ @agent_tool(token_optimized=True)
+ def _get_experiment_result(self, experiment_id: str) -> dict[str, Any]:
+ """
+ Read the result of a completed experiment.
+
+ Args:
+ experiment_id: ID of the experiment.
+ """
+ experiment = self._ledger.get_experiment(experiment_id)
+ if experiment is None:
+ return {"error": f"No experiment found with ID: {experiment_id}"}
+
+ result: dict[str, Any] = {
+ "experiment_id": experiment.id,
+ "hypothesis": experiment.hypothesis,
+ "routine_spec_id": experiment.routine_spec_id,
+ "status": experiment.status.value,
+ "verdict": experiment.verdict.value if experiment.verdict else None,
+ "summary": experiment.summary,
+ "takeaways": [t.model_dump(mode="json") for t in experiment.takeaways],
+ "output": experiment.output,
+ }
+
+ # Auto-inject relevant common-issues docs when experiment has errors
+ remediation_docs = self._get_remediation_docs_for_experiment(experiment)
+ if remediation_docs:
+ result["remediation_docs"] = remediation_docs
+
+ return result
+
+ # ===================================================================
+ # RECORDING TOOLS
+ # ===================================================================
+
+ @agent_tool()
+ def _record_finding(
+ self,
+ experiment_id: str,
+ verdict: str,
+ summary: str,
+ takeaways: list[dict[str, Any]] | None = None,
+ ) -> dict[str, Any]:
+ """
+ Record a verdict after reviewing an experiment result.
+
+ MUST be called for every completed experiment. This builds the
+ experiment log that drives your next decisions.
+
+ Args:
+ experiment_id: ID of the experiment.
+ verdict: One of 'confirmed', 'refuted', 'partial', 'needs_followup'.
+ summary: What we learned, in one or two sentences.
+ takeaways: Optional reusable lessons for future workers.
+ Each item should include:
+ - claim (required): concrete fact to reuse
+ - evidence (optional): supporting detail
+ - how_to_apply_next (optional): instruction for later experiments
+ - confidence (optional): float in [0, 1]
+ - tags (optional): short labels like auth/pagination/endpoint
+ """
+ experiment = self._ledger.get_experiment(experiment_id)
+ if experiment is None:
+ return {"error": f"No experiment found with ID: {experiment_id}"}
+
+ try:
+ experiment.verdict = ExperimentVerdict(verdict)
+ except ValueError:
+ return {
+ "error": f"Invalid verdict: {verdict}. "
+ f"Must be one of: {[v.value for v in ExperimentVerdict]}"
+ }
+
+ experiment.summary = summary
+ if takeaways is not None:
+ parsed_takeaways: list[ExperimentTakeaway] = []
+ for idx, raw_takeaway in enumerate(takeaways):
+ if not isinstance(raw_takeaway, dict):
+ return {"error": f"takeaways[{idx}] must be an object"}
+ claim = raw_takeaway.get("claim")
+ if not isinstance(claim, str) or not claim.strip():
+ return {"error": f"takeaways[{idx}].claim is required and must be a non-empty string"}
+
+ confidence = raw_takeaway.get("confidence")
+ if confidence is not None:
+ try:
+ confidence = float(confidence)
+ except (TypeError, ValueError):
+ return {"error": f"takeaways[{idx}].confidence must be a float in [0, 1]"}
+ if confidence < 0 or confidence > 1:
+ return {"error": f"takeaways[{idx}].confidence must be between 0 and 1"}
+
+ tags_raw = raw_takeaway.get("tags", [])
+ if tags_raw is None:
+ tags_raw = []
+ if not isinstance(tags_raw, list):
+ return {"error": f"takeaways[{idx}].tags must be a list of strings"}
+
+ tags: list[str] = []
+ for t in tags_raw:
+ if isinstance(t, str):
+ tag = t.strip()
+ if tag:
+ tags.append(tag)
+ parsed_takeaways.append(
+ ExperimentTakeaway(
+ claim=claim.strip(),
+ evidence=raw_takeaway.get("evidence"),
+ how_to_apply_next=raw_takeaway.get("how_to_apply_next"),
+ confidence=confidence,
+ tags=tags,
+ )
+ )
+ experiment.takeaways = parsed_takeaways
+
+ self._persist(f"finding_{experiment.id}")
+ return {
+ "experiment_id": experiment.id,
+ "verdict": experiment.verdict.value,
+ "summary": summary,
+ "takeaway_count": len(experiment.takeaways),
+ }
+
+ @agent_tool()
+ def _record_proven_artifact(
+ self,
+ artifact_type: str,
+ details: dict[str, Any],
+ ) -> dict[str, Any]:
+ """
+ Add a proven artifact to the ledger. Call this when an experiment confirms
+ a fetch, navigation, token, or parameter that will be part of a routine.
+
+ IMPORTANT: The 'details' parameter MUST be a JSON object (dict), NOT a string.
+
+ Args:
+ artifact_type: One of 'fetch', 'navigation', 'token', 'parameter'.
+ details: A JSON object with artifact-specific info. NOT a string.
+
+ Example calls:
+ record_proven_artifact({
+ "artifact_type": "fetch",
+ "details": {"url": "https://api.example.com/data", "method": "GET",
+ "headers": {}, "response_preview": "200 OK with JSON"}
+ })
+ record_proven_artifact({
+ "artifact_type": "navigation",
+ "details": {"url": "https://example.com", "sets_up": ["session_cookie"]}
+ })
+ record_proven_artifact({
+ "artifact_type": "parameter",
+ "details": {"name": "seasonId", "type": "number",
+ "description": "Season identifier", "example_value": 2025}
+ })
+ """
+ try:
+ atype = ArtifactType(artifact_type)
+ except ValueError:
+ return {
+ "error": f"Invalid artifact_type: {artifact_type}. "
+ f"Must be one of: {[t.value for t in ArtifactType]}"
+ }
+
+ proven = self._ledger.proven
+ if atype == ArtifactType.FETCH:
+ proven.fetches.append(details)
+ elif atype == ArtifactType.NAVIGATION:
+ proven.navigations.append(details)
+ elif atype == ArtifactType.TOKEN:
+ proven.tokens.append(details)
+ elif atype == ArtifactType.PARAMETER:
+ proven.parameters.append(details)
+
+ self._persist(f"artifact_{artifact_type}")
+ return {"ok": True, "artifact_type": artifact_type, "details": details}
+
+ # ===================================================================
+ # ROUTINE SUBMISSION TOOLS
+ # ===================================================================
+
+ @agent_tool(persist=ToolResultPersistMode.ALWAYS, token_optimized=True)
+ def _submit_routine(
+ self,
+ spec_id: str,
+ routine_json: dict[str, Any],
+ test_parameters: dict[str, Any],
+ ) -> dict[str, Any]:
+ """
+ Submit a routine attempt for validation, execution, and inspection.
+
+ Pipeline: validate → execute with test_parameters → inspect → verdict.
+
+ IMPORTANT: You MUST provide test_parameters with realistic values for
+ every parameter defined in the routine. The routine will be executed
+ in a live browser and the result sent to an independent inspector.
+
+ The routine_json MUST match the Routine schema (see system prompt).
+ Key rules:
+ - Use "operations" (not "steps"), each with a "type" discriminator
+ - fetch operations need "endpoint": {"url": "...", "method": "GET"}
+ - Last operation must be "return", "return_html", or "download"
+ - Must have ≥2 operations (navigate + fetch + return is typical)
+ - Use {{paramName}} placeholders in URLs/headers/body for parameters
+ - fetch needs session_storage_key to save results for the return op
+
+ Example:
+ submit_routine({
+ "spec_id": "abc123",
+ "routine_json": {
+ "name": "search_examplesite_products",
+ "description": "Searches the ExampleSite product catalog by query string, returning product names, prices, ratings, and availability.",
+ "parameters": [
+ {"name": "search_query", "type": "string", "description": "Free-text search query to find products (e.g. 'wireless headphones', 'running shoes')"}
+ ],
+ "operations": [
+ {"type": "navigate", "url": "https://www.example.com"},
+ {"type": "fetch", "endpoint": {"url": "https://api.example.com/search?q={{search_query}}", "method": "GET"}, "session_storage_key": "result"},
+ {"type": "return", "session_storage_key": "result"}
+ ]
+ },
+ "test_parameters": {"search_query": "wireless headphones"}
+ })
+
+ Args:
+ spec_id: Which RoutineSpec this routine fulfills.
+ routine_json: The complete routine dict matching the Routine schema.
+ test_parameters: Parameter values for test execution. REQUIRED —
+ must include a value for every parameter in the routine.
+ """
+ spec = self._ledger.get_spec(spec_id)
+ if spec is None:
+ return {"error": f"No spec found with ID: {spec_id}"}
+
+ if test_parameters is None:
+ return {
+ "error": "test_parameters is required. Provide realistic values "
+ "for every parameter so the routine can be executed and inspected. "
+ "If the routine has no parameters, pass an empty object: {}"
+ }
+
+ # ----- Documentation quality gate (before Pydantic validation) -----
+ doc_issues = self._check_routine_documentation_quality(routine_json)
+ if doc_issues:
+ return {
+ "success": False,
+ "stage": "documentation_quality",
+ "issues": doc_issues,
+ "hint": (
+ "These routines will be vectorized and stored in databases for other "
+ "agents to discover and use. Names and descriptions must be precise "
+ "enough for semantic search and unambiguous enough for autonomous use. "
+ "Fix the issues above and resubmit."
+ ),
+ }
+
+ # ----- Site-credential parameter gate -----
+ # Reject parameters that look like site-level API keys / subscription keys.
+ # These should be hardcoded from captures, not exposed as user parameters.
+        # Names are compared after lowercasing and normalizing '-' to '_',
+        # so only underscore/concatenated forms are needed here.
+        _CREDENTIAL_PATTERNS = {
+            "api_key", "apikey", "subscription_key", "subscriptionkey",
+            "client_secret", "client_id", "app_key", "appkey",
+            "app_secret", "secret_key", "secretkey", "access_key", "accesskey",
+        }
+ params = routine_json.get("parameters", [])
+ suspect_params = [
+ p["name"] for p in params
+ if isinstance(p, dict) and p.get("name", "").lower().replace("-", "_") in _CREDENTIAL_PATTERNS
+ ]
+ if suspect_params:
+ return {
+ "success": False,
+ "stage": "credential_parameter_check",
+ "suspect_parameters": suspect_params,
+ "error": (
+ f"Parameter(s) {suspect_params} look like site-level API keys or "
+ "subscription keys. These are NOT user secrets — they are constants "
+ "baked into the website's JavaScript or network requests. "
+ "You MUST: (1) find the actual value from captures using "
+ "get_recorded_transaction to inspect request headers, "
+ "(2) hardcode it directly in the routine's headers/body, "
+ "(3) remove it from parameters. "
+ "Only parameterize values a USER would naturally provide "
+ "(search terms, dates, IDs, locations)."
+ ),
+ }
+
+ # Check attempt limit
+ existing_attempts = self._ledger.get_attempts_for_spec(spec_id)
+ if len(existing_attempts) >= self._max_attempts_per_routine:
+ return {
+ "error": f"Max attempts ({self._max_attempts_per_routine}) reached for {spec.name}. "
+ "Consider mark_routine_failed if this routine can't be built."
+ }
+
+ # ----- Duplicate routine check -----
+ # Hash the operations list to detect identical resubmissions.
+ # This prevents wasting inspector tokens on the exact same broken routine.
+ new_ops_hash = hashlib.sha256(
+ json.dumps(routine_json.get("operations", []), sort_keys=True).encode()
+ ).hexdigest()
+ for prev_attempt in existing_attempts:
+ prev_ops_hash = hashlib.sha256(
+ json.dumps(prev_attempt.routine_json.get("operations", []), sort_keys=True).encode()
+ ).hexdigest()
+ if new_ops_hash == prev_ops_hash:
+ prev_issues = prev_attempt.blocking_issues or []
+ return {
+ "error": (
+ f"This routine has IDENTICAL operations to attempt #{prev_attempt.id} "
+ f"which already FAILED inspection. Resubmitting the same routine wastes "
+ f"tokens and will produce the same result. You MUST change the operations "
+ f"to address the previous blocking issues before resubmitting."
+ ),
+ "previous_blocking_issues": prev_issues,
+ "hint": (
+ "Review the blocking issues above. Common fixes: add auth/token "
+ "operations, add missing headers, fix endpoint URLs, resolve "
+ "placeholder issues. The routine must be STRUCTURALLY DIFFERENT "
+ "from previous failed attempts."
+ ),
+ }
+
+ # Step 1: Validate routine JSON against the Routine model
+ try:
+ routine = Routine.model_validate(routine_json)
+ except ValidationError as e:
+ issues: list[str] = []
+ for err in e.errors():
+ loc = ".".join(str(part) for part in err.get("loc", [])) or "root"
+ msg = err.get("msg", "Invalid value")
+ issues.append(f"{loc}: {msg}")
+ return {
+ "success": False,
+ "stage": "validation",
+ "validation_errors": issues or [str(e)],
+ "issues": issues or [str(e)],
+ "hint": (
+ "Routine validation failed. BEFORE retrying, use your documentation "
+ "tools to review the correct schema: call search_files(scope='docs', query='Routine operation "
+ "endpoint fetch', mode='exact') or read_file(scope='docs', path='...') to read the Routine and operation "
+ "model source code. Key reminders: each operation needs 'type' "
+ "(e.g. 'fetch'), fetch operations need 'endpoint': {'url': '...', "
+ "'method': 'GET'}, last operation must be type 'return' or "
+ "'return_html' or 'download', and the routine needs at least 2 "
+ "operations. Check the schema in your system prompt carefully."
+ ),
+ }
+ except Exception as e:
+ return {
+ "success": False,
+ "stage": "validation",
+ "validation_errors": [str(e)],
+ "issues": [str(e)],
+ "hint": (
+ "Routine validation failed. BEFORE retrying, use your documentation "
+ "tools to review the correct schema and operation requirements."
+ ),
+ }
+
+ # Create attempt record
+ parent_id = existing_attempts[-1].id if existing_attempts else None
+ attempt = RoutineAttempt(
+ routine_spec_id=spec_id,
+ routine_json=json.loads(routine.model_dump_json()),
+ status=RoutineAttemptStatus.VALIDATING,
+ test_parameters=test_parameters,
+ parent_attempt_id=parent_id,
+ )
+ self._ledger.add_attempt(attempt)
+ spec.status = RoutineSpecStatus.VALIDATING
+ self._persist(f"attempt_{attempt.id}_validated")
+ # Persist an initial attempt record before execution/inspection.
+ # The final record for this attempt_id will overwrite this file.
+ self._record_attempt(
+ spec=spec,
+ attempt=attempt,
+ routine_json=routine_json,
+ test_parameters=test_parameters,
+ execution_result=None,
+ inspection_result=None,
+ )
+
+ # Step 2: Execute the routine with test parameters
+ attempt.status = RoutineAttemptStatus.EXECUTING
+ self._persist(f"attempt_{attempt.id}_executing")
+
+ execution_result = self._execute_routine_with_params(routine, test_parameters)
+
+ if execution_result is not None:
+ attempt.execution_result = execution_result.model_dump()
+ if not execution_result.ok:
+ attempt.execution_error = execution_result.error
+ logger.warning(
+ "Routine %s execution failed: %s", spec.name, execution_result.error,
+ )
+ else:
+ attempt.execution_error = "Execution unavailable (no browser or execution crashed)"
+
+ self._persist(f"attempt_{attempt.id}_executed")
+
+ # Step 3: Send to inspector for quality review
+ attempt.status = RoutineAttemptStatus.INSPECTING
+ self._persist(f"attempt_{attempt.id}_inspecting")
+
+ inspection_result = self._run_inspection(routine, execution_result, spec)
+
+ if inspection_result is not None:
+ # run_autonomous returns a SpecialistResultWrapper dict with the actual
+ # inspection data nested under "output". Unwrap it so downstream code
+ # can read overall_pass / blocking_issues / recommendations directly.
+ inner = inspection_result.get("output", inspection_result)
+ attempt.inspection_result = inner
+ attempt.overall_pass = inner.get("overall_pass", False)
+ attempt.blocking_issues = inner.get("blocking_issues", [])
+ attempt.recommendations = inner.get("recommendations", [])
+
+ if attempt.overall_pass:
+ attempt.status = RoutineAttemptStatus.PASSED
+ else:
+ attempt.status = RoutineAttemptStatus.FAILED
+ else:
+ # Inspector failed — treat as passed with warning (let PI decide)
+ attempt.status = RoutineAttemptStatus.PASSED
+ attempt.recommendations = ["Inspector was unavailable — manual review recommended"]
+
+ self._persist(f"attempt_{attempt.id}_inspected")
+
+ # Build response
+ response: dict[str, Any] = {
+ "success": True,
+ "attempt_id": attempt.id,
+ "spec_id": spec_id,
+ "operations_count": len(routine.operations),
+ "parameters_count": len(routine.parameters),
+ }
+
+ # Prepare execution payload (appended as the LAST response key below)
+ if execution_result is not None:
+ execution_payload: dict[str, Any] = execution_result.model_dump(mode="json")
+ else:
+ execution_payload = {"ok": False, "error": attempt.execution_error}
+
+ # Inspection summary
+ if inspection_result is not None:
+ response["inspection"] = {
+ "overall_pass": attempt.overall_pass,
+ "overall_score": inner.get("overall_score"),
+ "blocking_issues": attempt.blocking_issues,
+ "recommendations": attempt.recommendations,
+ "summary": inner.get("summary"),
+ }
+ else:
+ response["inspection"] = {"overall_pass": None, "note": "Inspector unavailable"}
+
+ response["verdict"] = attempt.status.value
+
+ # ----- Remediation hints for failed inspections -----
+ if not attempt.overall_pass and attempt.blocking_issues:
+ hints: list[str] = []
+ issues_text = " ".join(attempt.blocking_issues).lower()
+ if "failed to fetch" in issues_text or "typeerror" in issues_text:
+ hints.append(
+ "CORS FIX: Add a 'navigate' operation to the API's allowed origin "
+ "BEFORE any fetch. Routines start from about:blank (origin=null) so "
+ "all cross-origin fetches fail. Example: if API is at api.example.com "
+ "but CORS allows www.example.com, add {\"type\": \"navigate\", "
+ "\"url\": \"https://www.example.com\"} as the FIRST operation. "
+ "Review docs: search_files(scope='docs', query='cors-failed-to-fetch', mode='exact')."
+ )
+ if "401" in issues_text or "403" in issues_text or "unauthorized" in issues_text or "access denied" in issues_text:
+ hints.append(
+ "AUTH FIX: The routine is missing authentication. Add a fetch "
+ "operation to obtain a token/key, then a js_evaluate to extract "
+ "it, then include it in subsequent fetch headers via a "
+ "sessionStorage placeholder. Review docs: "
+ "search_files(scope='docs', query='unauthenticated', mode='exact')."
+ )
+ if "documentation quality" in issues_text:
+ hints.append(
+ "DOCS FIX: Improve routine name (verb_site_noun, 3+ segments), "
+ "description (>=8 words, explain action+inputs+outputs), and "
+ "parameter descriptions (>=3 words, explain where to get values)."
+ )
+ if hints:
+ response["remediation_hints"] = hints
+
+ # ----- Persist unified attempt record -----
+ # Overwrite the initial record for this attempt with final verdict/results.
+ self._record_attempt(
+ spec=spec,
+ attempt=attempt,
+ routine_json=routine_json,
+ test_parameters=test_parameters,
+ execution_result=execution_result,
+ inspection_result=inspection_result,
+ )
+
+ # Keep execution as the final key for readability in tool responses.
+ response["execution"] = execution_payload
+
+ return response
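The duplicate check above fingerprints a canonical JSON dump of the operations list: `sort_keys=True` makes key order irrelevant, so only a structural change to the operations produces a new hash. A standalone sketch of that fingerprint (the example routines are hypothetical):

```python
import hashlib
import json

def ops_hash(routine_json: dict) -> str:
    """Stable SHA-256 fingerprint of a routine's operations list."""
    return hashlib.sha256(
        json.dumps(routine_json.get("operations", []), sort_keys=True).encode()
    ).hexdigest()

a = {"operations": [{"type": "navigate", "url": "https://example.com"}]}
b = {"operations": [{"url": "https://example.com", "type": "navigate"}]}  # same ops, keys reordered
c = {"operations": [{"type": "navigate", "url": "https://example.com/v2"}]}

print(ops_hash(a) == ops_hash(b))  # True: key order is canonicalized away
print(ops_hash(a) == ops_hash(c))  # False: a structural change is detected
```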
+
+ @agent_tool()
+ def _mark_routine_shipped(
+ self,
+ spec_id: str,
+ attempt_id: str,
+ when_to_use: str,
+ parameters_summary: list[str] | None = None,
+ ) -> dict[str, Any]:
+ """
+ Mark a routine as shipped after it passes inspection/validation.
+ Moves the spec status to "shipped".
+
+ Args:
+ spec_id: ID of the RoutineSpec.
+ attempt_id: ID of the RoutineAttempt to ship.
+ when_to_use: Guidance for the user on when to use this routine.
+ parameters_summary: Human-readable parameter descriptions.
+ """
+ spec = self._ledger.get_spec(spec_id)
+ if spec is None:
+ return {"error": f"No spec found with ID: {spec_id}"}
+
+ attempt = self._ledger.get_attempt(attempt_id)
+ if attempt is None:
+ return {"error": f"No attempt found with ID: {attempt_id}"}
+
+ spec.status = RoutineSpecStatus.SHIPPED
+ spec.shipped_attempt_id = attempt_id
+
+ self._persist(f"shipped_{spec.name}")
+ return {
+ "ok": True,
+ "shipped": spec.name,
+ "attempt_id": attempt_id,
+ }
+
+ @agent_tool()
+ def _mark_routine_failed(
+ self,
+ spec_id: str,
+ reason: str,
+ ) -> dict[str, Any]:
+ """
+ Give up on a specific routine. Records why it failed.
+
+ Args:
+ spec_id: ID of the RoutineSpec.
+ reason: Why this routine can't be built.
+ """
+ spec = self._ledger.get_spec(spec_id)
+ if spec is None:
+ return {"error": f"No spec found with ID: {spec_id}"}
+
+ # Reject if already failed — stop the PI from looping on the same spec
+ if spec.status == RoutineSpecStatus.FAILED:
+ return {
+ "error": (
+ f"Routine '{spec.name}' is ALREADY marked as failed. "
+ "Do not call mark_routine_failed again. Move on to the next "
+ "unaddressed routine — call get_ledger to see which routines "
+ "still need work, then dispatch_experiment or submit_routine for those."
+ )
+ }
+
+ # Reject if already shipped
+ if spec.status == RoutineSpecStatus.SHIPPED:
+ return {"error": f"Routine '{spec.name}' is already shipped. Cannot mark as failed."}
+
+ # Global guardrail: require enough failed routine attempts across the pipeline
+ failed_attempts_global = sum(
+ 1 for attempt in self._ledger.attempts
+ if attempt.status == RoutineAttemptStatus.FAILED
+ )
+ if failed_attempts_global < self._min_global_failed_attempts_before_routine_failure:
+ return {
+ "error": (
+ f"Cannot mark routine '{spec.name}' as failed yet. "
+ f"Global failed routine attempts: {failed_attempts_global}/"
+ f"{self._min_global_failed_attempts_before_routine_failure} required. "
+ "Keep iterating: submit improved routine attempts, inspect failures, "
+ "run experiments to delegate exploration to workers when needed, "
+ "and only mark routines failed after enough global evidence exists."
+ )
+ }
+
+ # Guardrail: require minimum experimentation before giving up
+ spec_experiments = self._ledger.get_experiments_for_spec(spec_id)
+ if len(spec_experiments) < self.MIN_EXPERIMENTS_BEFORE_ROUTINE_FAILURE:
+ return {
+ "error": (
+ f"Cannot mark routine '{spec.name}' as failed after only "
+ f"{len(spec_experiments)} experiment(s). Try at least "
+ f"{self.MIN_EXPERIMENTS_BEFORE_ROUTINE_FAILURE} experiments "
+ "with different approaches before giving up. Consider: CDP-level "
+ "intercepts, direct navigation to API URLs, or checking the "
+ "captured session data for working request patterns."
+ )
+ }
+
+ spec.status = RoutineSpecStatus.FAILED
+ spec.failure_reason = reason
+
+ self._persist(f"failed_{spec.name}")
+ return {"ok": True, "failed": spec.name, "reason": reason}
+
+ # ===================================================================
+ # DASHBOARD TOOL
+ # ===================================================================
+
+ @agent_tool(token_optimized=True)
+ def _get_ledger(self) -> dict[str, Any]:
+ """
+ Read the full Discovery Ledger — routine specs, experiments, proven
+ artifacts, attempts, and unresolved questions. Use this to review
+ progress and decide what to work on next.
+ """
+ return {
+ "summary": self._ledger.to_summary(),
+ "total_specs": len(self._ledger.routine_specs),
+ "shipped": sum(
+ 1 for s in self._ledger.routine_specs
+ if s.status == RoutineSpecStatus.SHIPPED
+ ),
+ "failed": sum(
+ 1 for s in self._ledger.routine_specs
+ if s.status == RoutineSpecStatus.FAILED
+ ),
+ "total_experiments": len(self._ledger.experiments),
+ "confirmed": len(self._ledger.get_confirmed_experiments()),
+ "total_attempts": len(self._ledger.attempts),
+ "proven_fetches": len(self._ledger.proven.fetches),
+ "proven_navigations": len(self._ledger.proven.navigations),
+ "proven_tokens": len(self._ledger.proven.tokens),
+ "proven_parameters": len(self._ledger.proven.parameters),
+ "unresolved": self._ledger.unresolved,
+ }
+
+ # ===================================================================
+ # TERMINATION TOOLS
+ # ===================================================================
+
+ @agent_tool()
+ def _mark_complete(self, usage_guide: str) -> dict[str, Any]:
+ """
+ Signal that the pipeline is done. Call this when ALL routines
+ have been addressed (shipped or failed).
+
+ Takes a usage_guide string explaining how to use the routines
+ together and when to use each one, then builds the final RoutineCatalog.
+
+ Args:
+ usage_guide: How to use these routines together. Include:
+ - What each routine does
+ - When to use each one
+ - How they relate to each other
+ - What parameters each expects
+ """
+ # Guardrail: reject if routines are still unaddressed
+ unaddressed = [
+ s for s in self._ledger.routine_specs
+ if s.status not in (RoutineSpecStatus.SHIPPED, RoutineSpecStatus.FAILED)
+ ]
+ shipped_count = sum(
+ 1 for s in self._ledger.routine_specs
+ if s.status == RoutineSpecStatus.SHIPPED
+ )
+ if unaddressed:
+ unaddressed_names = [f"{s.name} ({s.status.value})" for s in unaddressed]
+ return {
+ "error": (
+ f"Cannot mark complete — {len(unaddressed)} routine(s) are still unaddressed: "
+ f"{', '.join(unaddressed_names)}. "
+ "Each routine must be either shipped (via submit_routine → mark_routine_shipped) "
+ "or explicitly failed (via mark_routine_failed) before calling mark_complete. "
+ "You must build and submit actual routine JSON with test_parameters for each routine."
+ )
+ }
+
+ # Guardrail: reject if nothing was shipped at all
+ if shipped_count == 0:
+ return {
+ "error": (
+ "Cannot mark complete with 0 shipped routines. At least one routine "
+ "must be successfully built, submitted, and shipped. Use submit_routine "
+ "with a complete routine_json and test_parameters to create routine attempts."
+ )
+ }
+
+ catalog = self._build_catalog(usage_guide)
+ self._ledger.catalog = catalog
+ self._pipeline_result = catalog
+ self._is_done = True
+
+ self._persist("complete")
+ return {
+ "status": "complete",
+ "routines_shipped": len(catalog.routines),
+ "routines_failed": len(catalog.failed_routines),
+ "total_experiments": catalog.total_experiments,
+ "total_attempts": catalog.total_attempts,
+ }
+
+ @agent_tool()
+ def _mark_failed(self, reason: str) -> dict[str, Any]:
+ """
+ Signal that the pipeline has failed — can't build ANY routines at all.
+
+ Args:
+ reason: Why construction failed entirely.
+ """
+ # Guardrail: prevent premature pipeline abandonment
+ total_experiments = len(self._ledger.experiments)
+ unaddressed_specs = [
+ s for s in self._ledger.routine_specs
+ if s.status not in (RoutineSpecStatus.SHIPPED, RoutineSpecStatus.FAILED)
+ ]
+ if total_experiments < self._min_experiments_before_fail and unaddressed_specs:
+ return {
+ "error": (
+ f"Cannot mark pipeline as failed after only {total_experiments} experiment(s). "
+ f"You have {len(unaddressed_specs)} unaddressed routine(s). "
+ "Try alternative approaches: use search_recorded_transactions to find working "
+ "request patterns, use browser_cdp_command for CDP-level intercepts, or "
+ "navigate directly to API URLs. Mark individual routines as failed with "
+ "mark_routine_failed if they truly can't be built, then call mark_complete."
+ )
+ }
+
+ self._is_done = True
+ self._pipeline_result = None
+ logger.warning("PI marked pipeline as failed: %s", reason)
+
+ self._persist("failed")
+ return {"status": "failed", "reason": reason}
+
+ # ===================================================================
+ # Internal — catalog building
+ # ===================================================================
+
+ def _build_catalog(self, usage_guide: str) -> RoutineCatalog:
+ """Build a RoutineCatalog from the current ledger state."""
+ shipped_routines: list[ShippedRoutine] = []
+ failed_routines: list[dict[str, Any]] = []
+
+ for spec in self._ledger.routine_specs:
+ if spec.status == RoutineSpecStatus.SHIPPED and spec.shipped_attempt_id:
+ attempt = self._ledger.get_attempt(spec.shipped_attempt_id)
+ if attempt:
+ routine_name = attempt.routine_json.get("name") or spec.name
+ routine_description = attempt.routine_json.get("description") or spec.description
+ shipped_routines.append(ShippedRoutine(
+ routine_spec_id=spec.id,
+ routine_json=attempt.routine_json,
+ name=routine_name,
+ description=routine_description,
+ when_to_use=f"Use to {routine_description.lower()}",
+ parameters_summary=[],
+ inspection_score=attempt.inspection_result.get("overall_score", 0)
+ if attempt.inspection_result else 0,
+ ))
+ elif spec.status == RoutineSpecStatus.FAILED:
+ failed_routines.append({
+ "name": spec.name,
+ "description": spec.description,
+ "reason": spec.failure_reason or "Unknown",
+ })
+
+ # Infer the target site from the first URL found in an exploration summary
+ site = "unknown"
+ for summary_text in self._exploration_summaries_raw.values():
+ if "://" in summary_text:
+ # Try to extract domain
+ match = re.search(r'https?://([^/\s]+)', summary_text)
+ if match:
+ site = match.group(1)
+ break
+
+ return RoutineCatalog(
+ site=site,
+ user_task=self._task,
+ routines=shipped_routines,
+ usage_guide=usage_guide,
+ failed_routines=failed_routines,
+ total_experiments=len(self._ledger.experiments),
+ total_attempts=len(self._ledger.attempts),
+ )
+
+ def _build_partial_catalog(self) -> RoutineCatalog | None:
+ """Build a partial catalog from whatever has been shipped so far."""
+ shipped = [
+ s for s in self._ledger.routine_specs
+ if s.status == RoutineSpecStatus.SHIPPED
+ ]
+ if not shipped:
+ return None
+ return self._build_catalog(
+ "Pipeline hit iteration limit. These routines were completed."
+ )
+
+ # ===================================================================
+ # Internal — worker management
+ # ===================================================================
+
+ def _create_worker(self) -> ExperimentWorker:
+ """Create a new ExperimentWorker instance with all available context."""
+ worker_workspace = (
+ self._worker_workspace_factory()
+ if self._worker_workspace_factory is not None
+ else None
+ )
+ return ExperimentWorker(
+ emit_message_callable=self._emit_message_callable,
+ # Browser context
+ remote_debugging_address=self._remote_debugging_address,
+ # Capture data loaders
+ network_data_loader=self._network_data_loader,
+ storage_data_loader=self._storage_data_loader,
+ dom_data_loader=self._dom_data_loader,
+ window_property_data_loader=self._window_property_data_loader,
+ # Config
+ llm_model=self._worker_llm_model,
+ workspace=worker_workspace,
+ )
+
+ def _create_inspector(self) -> RoutineInspector:
+ """Create a new RoutineInspector instance."""
+ inspector_workspace = (
+ self._inspector_workspace_factory()
+ if self._inspector_workspace_factory is not None
+ else None
+ )
+ return RoutineInspector(
+ emit_message_callable=self._emit_message_callable,
+ llm_model=self._worker_llm_model,
+ documentation_data_loader=self._documentation_data_loader,
+ workspace=inspector_workspace,
+ )
+
+ def _get_or_create_agent(self, task: Task) -> AbstractAgent:
+ """
+ Get existing agent instance or create/reuse one for the task.
+
+ Workers are capped at num_workers. Once the pool is full, new tasks
+ are assigned round-robin to existing workers (each gets a fresh
+ autonomous run but the PI can still dispatch new experiments to the same worker).
+ """
+ if task.agent_id and task.agent_id in self._agent_instances:
+ return self._agent_instances[task.agent_id]
+
+ # Check if we can reuse an existing worker (pool is full)
+ worker_ids = [
+ sid for sid, agent in self._agent_instances.items()
+ if isinstance(agent, ExperimentWorker)
+ ]
+
+ if len(worker_ids) >= self._num_workers:
+ # Round-robin to existing workers
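+ # (e.g. with num_workers=2 and pool [w0, w1], successive overflow tasks
+ # reuse w0, w1, w0, ... as _worker_counter advances; illustrative only)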
+ reuse_id = worker_ids[self._worker_counter % len(worker_ids)]
+ self._worker_counter += 1
+ task.agent_id = reuse_id
+ subagent = self._orchestration_state.subagents.get(reuse_id)
+ if subagent:
+ subagent.task_ids.append(task.id)
+ # Close old browser tab — _ensure_browser will create a fresh one
+ worker = self._agent_instances[reuse_id]
+ if isinstance(worker, ExperimentWorker):
+ worker.close()
+ return worker
+
+ # Create new worker
+ agent = self._create_worker()
+
+ subagent = SubAgent(
+ type=task.agent_type,
+ llm_model=self._worker_llm_model.value,
+ )
+ self._orchestration_state.subagents[subagent.id] = subagent
+ self._agent_instances[subagent.id] = agent
+
+ task.agent_id = subagent.id
+ subagent.task_ids.append(task.id)
+
+ # Wire up real-time thread persistence
+ agent_label = f"worker_{subagent.id}"
+ agent._on_chat_added = lambda _chat: self._dump_agent_thread(agent_label, agent)
+
+ return agent
+
+ def _get_or_create_inspector(self) -> RoutineInspector:
+ """
+ Get an existing inspector instance or create one.
+
+ Inspectors are capped at num_inspectors. Once the pool is full,
+ existing inspectors are reused round-robin (each gets a fresh
+ autonomous run via reset).
+ """
+ inspector_ids = [
+ sid for sid, agent in self._agent_instances.items()
+ if isinstance(agent, RoutineInspector)
+ ]
+
+ if len(inspector_ids) < self._num_inspectors:
+ # Pool not full — create a new inspector
+ inspector = self._create_inspector()
+ subagent = SubAgent(
+ type=SpecialistAgentType.ROUTINE_INSPECTOR,
+ llm_model=self._worker_llm_model.value,
+ )
+ self._orchestration_state.subagents[subagent.id] = subagent
+ self._agent_instances[subagent.id] = inspector
+ # Wire up real-time thread persistence
+ inspector_label = f"inspector_{subagent.id}"
+ inspector._on_chat_added = lambda _chat: self._dump_agent_thread(inspector_label, inspector)
+ return inspector
+
+ # Pool full — reuse round-robin with fresh conversation
+ reuse_id = inspector_ids[self._inspector_counter % len(inspector_ids)]
+ self._inspector_counter += 1
+ inspector = self._agent_instances[reuse_id]
+ assert isinstance(inspector, RoutineInspector)
+ inspector.reset()
+ return inspector
+
+ # ===================================================================
+ # Internal — routine execution and inspection
+ # ===================================================================
+
+ def _execute_routine_with_params(
+ self,
+ routine: Routine,
+ test_parameters: dict[str, Any] | None,
+ ) -> RoutineExecutionResultWithMetadata | None:
+ """Execute a routine with test parameters in a live browser."""
+ if not self._remote_debugging_address:
+ logger.warning("No remote_debugging_address — skipping routine execution")
+ return None
+
+ try:
+ result = routine.execute(
+ parameters_dict=test_parameters,
+ remote_debugging_address=self._remote_debugging_address,
+ timeout=120.0,
+ close_tab_when_done=True,
+ incognito=True,
+ )
+ return result
+ except Exception as e:
+ logger.error("Routine execution failed: %s", e)
+ return None
+
+ def _run_inspection(
+ self,
+ routine: Routine,
+ execution_result: RoutineExecutionResultWithMetadata | None,
+ spec: RoutineSpec,
+ ) -> dict[str, Any] | None:
+ """Run a RoutineInspector on a routine + execution result."""
+ inspector = self._get_or_create_inspector()
+
+ # Build inspection prompt with all context
+ # NOTE: User task is intentionally excluded — the inspector should judge
+ # the routine on its own merits (correctness, robustness, data quality),
+ # not whether it fulfills the user's high-level goal.
+ prompt_parts: list[str] = [
+ f"## Routine Name\n{routine.name}\n",
+ f"## Routine Description\n{routine.description}\n",
+ f"## Routine JSON\n```json\n{json.dumps(routine.model_dump(), indent=2, default=str)}\n```\n",
+ ]
+
+ if execution_result is not None:
+ exec_data = execution_result.model_dump(mode="json")
+ exec_json = json.dumps(exec_data, indent=2, default=str)
+
+ persisted_for_inspector = False
+ if (
+ len(exec_json) > self.INSPECTOR_INLINE_EXECUTION_MAX_CHARS
+ and inspector.has_workspace
+ ):
+ try:
+ inspector_workspace = inspector._require_workspace()
+ inspector_workspace.ensure_dirs()
+ safe_spec = re.sub(r"[^a-zA-Z0-9_.-]+", "_", spec.name).strip("_")
+ if not safe_spec:
+ safe_spec = spec.id
+ artifact_ref = inspector_workspace.save_artifact(
+ source="raw",
+ filename=f"{safe_spec}_execution_result.json",
+ content=exec_json,
+ tool_name="pi_run_inspection",
+ content_type="json",
+ metadata={
+ "spec_id": spec.id,
+ "spec_name": spec.name,
+ "char_count": len(exec_json),
+ },
+ )
+ prompt_parts.append(
+ "## Execution Result\n"
+ f"Execution payload is large ({len(exec_json)} chars) and was saved to:\n"
+ f"- workspace path: `{artifact_ref.relative_path}`\n"
+ f"- artifact_id: `{artifact_ref.artifact_id}`\n\n"
+ "Use `execute_python` or `read_file(scope=\"workspace\", path=\"...\")` to inspect "
+ "this file directly and base your judgment on the full payload.\n"
+ "Do not claim truncation — the full execution result is available in workspace raw/.\n"
+ )
+ persisted_for_inspector = True
+ except Exception as e:
+ logger.warning(
+ "Failed to persist large execution payload for inspector; falling back to inline JSON: %s",
+ e,
+ )
+
+ if not persisted_for_inspector:
+ prompt_parts.append(
+ f"## Execution Result\n```json\n{exec_json}\n```\n"
+ )
+ else:
+ prompt_parts.append("## Execution Result\nNot available (no browser or execution failed).\n")
+
+ base_prompt = "\n".join(prompt_parts)
+
+ # Add exploration summaries for cross-reference. If the combined prompt gets too
+ # large, drop exploration summaries first. Do NOT truncate execution payload.
+ exploration_section = ""
+ if self._exploration_summaries:
+ exploration_parts: list[str] = ["## Exploration Summaries\n"]
+ for domain, summary in self._exploration_summaries.items():
+ exploration_parts.append(f"### {domain}\n{summary}\n")
+ exploration_section = "\n".join(exploration_parts)
+
+ inspection_prompt = (
+ f"{base_prompt}\n\n{exploration_section}"
+ if exploration_section
+ else base_prompt
+ )
+
+ max_chars = 120_000
+ if len(inspection_prompt) > max_chars and exploration_section:
+ logger.warning(
+ "Inspection prompt too large (%d chars); omitting exploration summaries to preserve full execution payload",
+ len(inspection_prompt),
+ )
+ inspection_prompt = (
+ f"{base_prompt}\n\n"
+ "## Exploration Summaries\n"
+ "[omitted due prompt size; consult persisted exploration artifacts if needed]\n"
+ )
+
+ if len(inspection_prompt) > max_chars:
+ logger.warning(
+ "Inspection prompt still large after omitting summaries (%d chars); sending full routine + execution payload without truncation",
+ len(inspection_prompt),
+ )
+
+ try:
+ config = AutonomousRunConfig(min_iterations=1, max_iterations=10)
+
+ # Run inspector with timeout to prevent indefinite hangs
+ with ThreadPoolExecutor(max_workers=1) as executor:
+ future = executor.submit(
+ inspector.run_autonomous,
+ task=inspection_prompt,
+ config=config,
+ output_schema=RoutineInspectionResult.model_json_schema(),
+ output_description="RoutineInspectionResult with scores, blocking issues, and verdict",
+ )
+ try:
+ result = future.result(timeout=self.WORKER_TIMEOUT_SECONDS)
+ except FuturesTimeoutError:
+ logger.error(
+ "Inspector timed out for %s after %ds", spec.name, self.WORKER_TIMEOUT_SECONDS,
+ )
+ self._dump_agent_thread(f"inspector_{spec.name}", inspector)
+ return None
+
+ # Dump inspector thread
+ self._dump_agent_thread(f"inspector_{spec.name}", inspector)
+
+ if result is not None:
+ return result.model_dump() if isinstance(result, BaseModel) else result
+ return None
+ except Exception as e:
+ logger.error("Inspection failed for %s: %s", spec.name, e)
+ return None
+
+ def _execute_task(self, task: Task) -> dict[str, Any]:
+ """Execute a task using an ExperimentWorker with a timeout guard."""
+ task.status = TaskStatus.IN_PROGRESS
+ task.started_at = datetime.now()
+
+ try:
+ agent = self._get_or_create_agent(task)
+
+ remaining_loops = task.max_loops - task.loops_used
+ if remaining_loops <= 0:
+ task.status = TaskStatus.FAILED
+ task.error = "No loops remaining"
+ return {"success": False, "error": "No loops remaining"}
+
+ config = AutonomousRunConfig(
+ min_iterations=1,
+ max_iterations=remaining_loops,
+ )
+
+ # Run with timeout to prevent indefinite hangs (LLM or browser)
+ with ThreadPoolExecutor(max_workers=1) as executor:
+ future = executor.submit(
+ agent.run_autonomous,
+ task=task.prompt,
+ config=config,
+ output_schema=task.output_schema,
+ output_description=task.output_description,
+ )
+ try:
+ result = future.result(timeout=self.WORKER_TIMEOUT_SECONDS)
+ except FuturesTimeoutError:
+ logger.error(
+ "Task %s timed out after %ds", task.id, self.WORKER_TIMEOUT_SECONDS,
+ )
+ task.status = TaskStatus.FAILED
+ task.error = f"Worker timed out after {self.WORKER_TIMEOUT_SECONDS}s"
+ task.completed_at = datetime.now()
+ self._dump_agent_thread(f"worker_{task.agent_id}", agent)
+ return {"success": False, "error": task.error}
+
+ task.loops_used += agent.autonomous_iteration
+
+ # Dump the worker's full message history for debugging
+ self._dump_agent_thread(f"worker_{task.agent_id}", agent)
+
+ if result is not None:
+ task.status = TaskStatus.COMPLETED
+ task.completed_at = datetime.now()
+ task.result = result.model_dump() if isinstance(result, BaseModel) else result
+ return {"success": True, "result": task.result}
+ else:
+ if task.loops_used < task.max_loops:
+ task.status = TaskStatus.PAUSED
+ return {"success": False, "status": "paused", "loops_used": task.loops_used}
+ else:
+ task.status = TaskStatus.FAILED
+ task.error = "Max loops reached without result"
+ return {"success": False, "error": task.error}
+
+ except Exception as e:
+ task.status = TaskStatus.FAILED
+ task.error = str(e)
+ task.completed_at = datetime.now()
+ logger.error("Task %s failed: %s", task.id, e)
+ return {"success": False, "error": str(e)}
+
+ # ===================================================================
+ # Internal — documentation quality checks
+ # ===================================================================
+
+ @staticmethod
+ def _check_routine_documentation_quality(routine_json: dict[str, Any]) -> list[str]:
+ """
+ Validate that routine metadata is detailed enough for vectorized storage
+ and discovery by other agents. Returns a list of issues (empty = pass).
+ """
+ issues: list[str] = []
+
+ # --- Routine name ---
+ name = routine_json.get("name", "")
+ if not name:
+ issues.append("Routine name is missing.")
+ else:
+ # Must be snake_case (lowercase + underscores)
+ if not re.match(r'^[a-z][a-z0-9]*(_[a-z0-9]+)*$', name):
+ issues.append(
+ f"Routine name '{name}' must be snake_case (e.g. 'get_premierleague_standings', "
+ "'search_amtrak_trains'). No camelCase, no spaces, no uppercase."
+ )
+ # Must be descriptive — at least 3 underscore-separated segments
+ # (verb + site/context + noun, e.g. get_premierleague_standings)
+ segments = name.split("_")
+ if len(segments) < 3:
+ issues.append(
+ f"Routine name '{name}' needs more context ({len(segments)} segments, need ≥3). "
+ "The name must include the site/service so it makes sense in isolation. "
+ "Pattern: verb_site_noun (e.g. 'get_premierleague_standings', "
+ "'search_espn_scores', 'fetch_amtrak_schedules'). "
+ "Another agent reading ONLY the name should know what site this targets."
+ )
+ # Reject overly generic names that lack site context
+ _GENERIC_NOUNS = {
+ "data", "items", "item", "content", "results", "result",
+ "info", "details", "list", "response", "output", "records",
+ }
+ # Check if the non-verb segments are all generic
+ non_verb_segments = segments[1:] if len(segments) > 1 else []
+ if non_verb_segments and all(seg in _GENERIC_NOUNS for seg in non_verb_segments):
+ issues.append(
+ f"Routine name '{name}' uses only generic nouns ({non_verb_segments}). "
+ "Include the site/domain name and a specific noun. "
+ "Example: 'get_content_item' → 'get_premierleague_article', "
+ "'fetch_data' → 'fetch_espn_game_scores'."
+ )
+
+ # --- Routine description ---
+ desc = routine_json.get("description", "")
+ if not desc:
+ issues.append("Routine description is missing.")
+ else:
+ word_count = len(desc.split())
+ if word_count < 8:
+ issues.append(
+ f"Routine description is too short ({word_count} words). Must be ≥8 words. "
+ "Describe: what the routine does, what inputs it takes, and what data it returns. "
+ "Example: 'Fetches Premier League standings for a given competition and season, "
+ "returning team names, positions, wins, draws, losses, and points.'"
+ )
+ # Should mention what it returns
+ desc_lower = desc.lower()
+ return_keywords = ("return", "fetch", "retriev", "get", "extract", "download", "output", "produc", "yield")
+ if not any(kw in desc_lower for kw in return_keywords):
+ issues.append(
+ "Routine description should explain what data it returns. "
+ "Include words like 'returns', 'fetches', 'retrieves', or 'extracts'."
+ )
+
+ # --- Parameter descriptions ---
+ # Suffixes/keywords that signal an opaque, non-obvious parameter value
+ _OPAQUE_SIGNALS = ("_id", "_ids", "_slug", "_code", "_token", "_key", "_hash", "_uuid")
+ _SOURCE_KEYWORDS = (
+ "obtain", "from the", "get from", "found in", "returned by",
+ "use the", "listed by", "provided by", "available via", "see the",
+ "look up", "call the", "via the", "endpoint", "routine",
+ )
+
+ params = routine_json.get("parameters", [])
+ for param in params:
+ if not isinstance(param, dict):
+ continue
+ pname = param.get("name", "unknown")
+ pdesc = param.get("description", "")
+ if not pdesc:
+ issues.append(f"Parameter '{pname}' is missing a description.")
+ continue
+
+ if len(pdesc.split()) < 3:
+ issues.append(
+ f"Parameter '{pname}' description is too terse: '{pdesc}'. "
+ "Descriptions must be ≥3 words and explain what the value represents "
+ "and its expected format (e.g. 'The unique season identifier, typically a 4-digit year like 2024')."
+ )
+ continue
+
+ # Check if this looks like an opaque/non-obvious parameter
+ pname_lower = pname.lower()
+ ptype = param.get("type", "string")
+ is_opaque = (
+ any(pname_lower.endswith(sig) for sig in _OPAQUE_SIGNALS)
+ or (ptype in ("integer", "number") and pname_lower.endswith("id"))
+ )
+
+ if is_opaque:
+ pdesc_lower = pdesc.lower()
+ has_source = any(kw in pdesc_lower for kw in _SOURCE_KEYWORDS)
+ if not has_source:
+ issues.append(
+ f"Parameter '{pname}' looks like an opaque/internal identifier but its "
+ f"description doesn't explain WHERE to get valid values. "
+ f"Current description: '{pdesc}'. "
+ "For non-obvious IDs, slugs, and codes, the description MUST say how "
+ "to obtain valid values — e.g. 'Obtain from the get_competitions routine' "
+ "or 'Found in the /api/seasons endpoint response'."
+ )
+
+ return issues
+
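+ # Worked example (hypothetical input) for _check_routine_documentation_quality:
+ #   _check_routine_documentation_quality({
+ #       "name": "getData",
+ #       "description": "Gets data",
+ #       "parameters": [{"name": "team_id", "description": "ID"}],
+ #   })
+ # flags four issues: the non-snake_case name, the 1-segment name, the
+ # 2-word description, and the terse 'team_id' parameter description.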
+ def close(self) -> None:
+ """Clean up all worker agent instances."""
+ for agent in self._agent_instances.values():
+ if hasattr(agent, "close"):
+ try:
+ agent.close()
+ except Exception:
+ pass
+ self._agent_instances.clear()
diff --git a/bluebox/agents/routine_discovery_agent_beta.py b/bluebox/agents/routine_discovery_agent_beta.py
deleted file mode 100644
index 8eae08d9..00000000
--- a/bluebox/agents/routine_discovery_agent_beta.py
+++ /dev/null
@@ -1,2157 +0,0 @@
-"""
-bluebox/agents/routine_discovery_agent_beta.py
-
-RoutineDiscoveryAgentBeta - orchestrator for routine discovery.
-
-This agent coordinates specialist subagents (JSSpecialist, NetworkSpecialist, etc.)
-to discover routines from CDP captures. It delegates specific tasks to specialists
-while managing the overall discovery workflow:
-
-1. PLANNING: Analyze task, plan approach
-2. DISCOVERING: Delegate discovery tasks to specialists
-3. CONSTRUCTING: Build routine from discoveries
-4. VALIDATING: Test the constructed routine
-5. COMPLETE/FAILED: Finish discovery
-
-The agent inherits from AbstractAgent for LLM/chat/tool infrastructure.
-"""
-
-from __future__ import annotations
-
-import json
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
-from textwrap import dedent
-from typing import Any, Callable
-
-from pydantic import BaseModel
-
-from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool
-from bluebox.agents.specialists.abstract_specialist import AbstractSpecialist, AutonomousConfig, RunMode
-from bluebox.agents.specialists.js_specialist import JSSpecialist
-from bluebox.agents.specialists.network_specialist import NetworkSpecialist
-from bluebox.agents.specialists.value_trace_resolver_specialist import ValueTraceResolverSpecialist
-from bluebox.agents.specialists.interaction_specialist import InteractionSpecialist
-from bluebox.data_models.llms.interaction import (
- Chat,
- ChatRole,
- ChatThread,
- EmittedMessage,
- ChatResponseEmittedMessage,
- ErrorEmittedMessage,
-)
-from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel
-from bluebox.data_models.orchestration.task import Task, SubAgent, TaskStatus, SpecialistAgentType
-from bluebox.data_models.orchestration.state import AgentOrchestrationState
-from bluebox.data_models.routine.endpoint import HTTPMethod
-from bluebox.data_models.routine.routine import Routine
-from bluebox.data_models.routine_discovery.state import RoutineDiscoveryState, DiscoveryPhase
-from bluebox.data_models.routine_discovery.llm_responses import (
- TransactionIdentificationResponse,
- Variable,
- VariableType,
- ExtractedVariableResponse,
- ResolvedVariableResponse,
- SessionStorageSource,
- TransactionSource,
- WindowPropertySource,
- SessionStorageType,
-)
-from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader
-from bluebox.llms.data_loaders.interactions_data_loader import InteractionsDataLoader
-from bluebox.llms.data_loaders.js_data_loader import JSDataLoader
-from bluebox.llms.data_loaders.network_data_loader import NetworkDataLoader
-from bluebox.llms.data_loaders.storage_data_loader import StorageDataLoader
-from bluebox.llms.data_loaders.window_property_data_loader import WindowPropertyDataLoader
-from bluebox.utils.data_utils import resolve_dotted_path
-from bluebox.utils.logger import get_logger
-
-logger = get_logger(name=__name__)
-
-
-class RoutineDiscoveryAgentBeta(AbstractAgent):
- """
- Orchestrator agent that coordinates specialist subagents for routine discovery.
-
- Unlike specialists which do focused work, this agent plans and delegates:
- - Creates tasks for specialists to handle
- - Runs tasks and collects results
- - Uses results to construct routines
- """
-
- AGENT_CARD = AgentCard(
- description=(
- "Orchestrates routine discovery by coordinating specialist subagents. "
- "Delegates network analysis, value tracing, JS generation, and interaction "
- "analysis to specialists, then assembles the results into a routine."
- ),
- )
-
- ## System prompts — phase-scoped sections
-
- # Core identity + delegation rules (included in every phase)
- PROMPT_CORE: str = dedent("""\
- You are an expert at analyzing network traffic and building web automation routines.
- You coordinate specialist agents to discover and construct routines.
-
- ## Your Task
- Analyze captured browser network data to create a reusable routine that accomplishes the user's task.
-
- ## CRITICAL: You MUST Delegate to Specialists
-
- **DO NOT** try to do everything yourself with direct tools. You are an ORCHESTRATOR.
- Your job is to coordinate specialists, not to manually inspect every transaction.
-
- **How to delegate:**
- 1. `create_task(agent_type="network_specialist", prompt="...")`
- 2. `run_pending_tasks()`
- 3. `get_task_result(task_id)` to review findings
-
- ## Important Notes
- - Focus on the user's INTENT, not literal wording
- - Keep parameters MINIMAL - only what the user MUST provide
- - If only one value was observed and it could be hardcoded, hardcode it
- - Credentials for fetch operations: same-origin > include > omit
- """)
-
- # Phase-specific instructions (only the active phase's block is included)
- PROMPT_PLANNING: str = dedent("""\
- ## Current Phase: PLANNING — Identify the Target Endpoint
-
- 1. **REQUIRED**: Create a task for network_specialist to find the endpoint:
- ```
- create_task(
- agent_type="network_specialist",
-            prompt="Find the API endpoint that accomplishes: . Search for relevant keywords."
- )
- ```
- 2. Call `run_pending_tasks()` to execute
- 3. Review results with `get_task_result(task_id)`
- 4. Use `record_identified_endpoint` with the specialist's findings
- """)
-
- PROMPT_DISCOVERING: str = dedent("""\
- ## Current Phase: DISCOVERING — Process Transactions (BFS Queue)
-
- For each transaction in the queue:
- 1. Use `get_transaction` to see full details
- 2. Use `record_extracted_variable` to log variables found in the request
- 3. **For DYNAMIC_TOKENs — DELEGATE TO value_trace_resolver**:
- ```
- create_task(
- agent_type="value_trace_resolver",
- prompt="Trace the origin of value '' (variable: ). Find where it comes from."
- )
- ```
- 4. Call `run_pending_tasks()` then `get_task_result(task_id)` to get findings
- 5. Use `record_resolved_variable` to record where each token comes from
- - If source is another transaction, it will be auto-added to the queue
- - PREFER NETWORK SOURCES: When a value appears in both session storage AND a prior
- transaction response, use source_type='transaction' as the PRIMARY source.
- Session storage may be empty in a fresh session.
- 6. Use `mark_transaction_processed` when done with a transaction
- 7. Continue until queue is empty
-
- ## Variable Classification Rules
-
- **PARAMETER** (requires_dynamic_resolution=false):
- - Values the user explicitly provides as input
- - Examples: search_query, item_id, page_number, username
- - Rule: If the user wouldn't directly provide this value, it's NOT a parameter
-
- **DYNAMIC_TOKEN** (requires_dynamic_resolution=true):
- - Auth/session values that change per session
- - Examples: CSRF tokens, JWTs, session_id, visitorData, auth headers
- - Also: trace IDs, request IDs, correlation IDs
- - Rule: If it looks like a generated ID or security token, it's a DYNAMIC_TOKEN
-
- **STATIC_VALUE** (requires_dynamic_resolution=false):
- - Constants that don't change between sessions
- - Examples: App version, User-Agent, clientName, timeZone, language codes
- - Rule: If you can hardcode it and it will work across sessions, it's STATIC
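-
-    Worked example (illustrative field names): classifying a captured request body.
-    ```
-    {"query": "running shoes", "csrfToken": "a1b2c3d4", "clientName": "WEB"}
-    query      -> PARAMETER      (user-provided search input)
-    csrfToken  -> DYNAMIC_TOKEN  (per-session security token; needs resolution)
-    clientName -> STATIC_VALUE   (constant across sessions; safe to hardcode)
-    ```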
- """)
-
- PROMPT_CONSTRUCTING: str = dedent("""\
- ## Current Phase: CONSTRUCTING — Build the Routine
-
- 1. Use `get_discovery_context` to see all processed data (includes CRITICAL_OBSERVED_VALUES)
- 2. Review the **Routine Schema Reference** below for required fields and operation types
- 3. Use `construct_routine` with the routine definition:
- - `routine`: the routine definition (name, description, parameters, operations)
-
- **If browser is connected (validation available):**
- 4. After constructing, use `validate_routine` with test_parameters (observed values)
- 5. Use `analyze_validation` to reflect on results (REQUIRED before done)
-
- **If NO browser connected:**
- 4. Call `done` directly after construct_routine
-
- ## Operation Ordering
-
- Routines typically start with a `navigate` operation to load the target page before
- performing any other operations. This is important because:
- - `fetch` operations run in the page's JS context — without navigating first, the
- browser has no origin, so requests fail with CORS errors.
-    - The same constraint applies to `js_evaluate` operations.
- - Click/input/scroll operations need a loaded DOM to interact with.
-
- Look at the root transaction's URL to determine the base URL to navigate to (usually
- the origin, e.g. `https://example.com`).
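-
-    Illustrative operation order (exact fields are defined in the Routine Schema Reference below):
-    ```
-    operations:
-      1. navigate -> https://example.com             (establishes origin + session)
-      2. fetch    -> /api/search?q={{query}}         (credentials: "include")
-    ```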
- """)
-
- PLACEHOLDER_INSTRUCTIONS: str = (
- "## Placeholder Syntax\n"
- "ALL placeholders use {{param_name}} — the parameter's `type` field drives type coercion at resolution time.\n\n"
- "- PARAMS: {{param_name}} (NO prefix, name matches parameter definition)\n"
- "- SOURCES (use dot paths): {{cookie:name}}, {{sessionStorage:path.to.value}}, "
- "{{localStorage:key}}, {{windowProperty:obj.key}}\n\n"
- "EXAMPLES:\n"
- '1. String param: "name": "{{username}}" -> "name": "john" (type=string)\n'
- '2. Number param: "count": "{{limit}}" -> "count": 50 (type=integer)\n'
- '3. Bool param: "active": "{{is_active}}" -> "active": true (type=boolean)\n'
- '4. In URL: "/api/{{user_id}}/data" -> "/api/123/data"\n'
- '5. Session storage: "token": "{{sessionStorage:auth.access_token}}"\n'
- '6. Cookie: "sid": "{{cookie:session_id}}"\n\n'
- "CRITICAL — MATCH TYPES TO THE RAW CDP REQUEST:\n"
- 'If the raw CDP request has "adults": "5" (a string), use type=string, NOT type=integer.\n'
- "Integer would produce 5 (unquoted) and may break the API. Always match the type observed in the actual request."
- )
-
- PROMPT_VALIDATING: str = dedent("""\
- ## Current Phase: VALIDATING — Test the Routine
-
- 1. Review the `validate_routine` execution results
- 2. Use `analyze_validation` to reflect:
- - `analysis`: What worked and what failed
- - `data_matches_task`: Does the returned data accomplish the user's original task?
- - `next_action`: "done" | "fix_routine" | "retry_validation"
- 3. Based on your analysis:
- - If data_matches_task=True and next_action="done": call `done`
- - If data_matches_task=False: set next_action="fix_routine", then use construct_routine to fix and re-validate
- """)
-
- ## Magic methods
-
- def __init__(
- self,
- emit_message_callable: Callable[[EmittedMessage], None],
- network_data_loader: NetworkDataLoader,
- task: str,
- storage_data_loader: StorageDataLoader | None = None,
- window_property_data_loader: WindowPropertyDataLoader | None = None,
- js_data_loader: JSDataLoader | None = None,
- interaction_data_loader: InteractionsDataLoader | None = None,
- documentation_data_loader: DocumentationDataLoader | None = None,
- llm_model: LLMModel = OpenAIModel.GPT_5_2,
- subagent_llm_model: LLMModel | None = None,
- max_iterations: int = 50,
- remote_debugging_address: str | None = None,
- persist_chat_callable: Callable[[Chat], Chat] | None = None,
- persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None,
- stream_chunk_callable: Callable[[str], None] | None = None,
- chat_thread: ChatThread | None = None,
- existing_chats: list[Chat] | None = None,
- ) -> None:
- """
- Initialize the RoutineDiscoveryAgentBeta.
-
- Args:
- emit_message_callable: Callback to emit messages to the host.
- network_data_loader: NetworkDataLoader with network traffic data.
- task: The discovery task description.
- storage_data_loader: Optional StorageDataLoader for browser storage.
- window_property_data_loader: Optional WindowPropertyDataLoader for window properties.
- js_data_loader: Optional JSDataLoader for JavaScript files.
- interaction_data_loader: Optional InteractionsDataLoader for interaction events.
- documentation_data_loader: Optional DocumentationDataLoader for docs and code files.
- llm_model: LLM model for the orchestrator.
- subagent_llm_model: LLM model for subagents (defaults to orchestrator's model).
- max_iterations: Maximum iterations for the main loop.
- remote_debugging_address: Chrome remote debugging address for validation.
- persist_chat_callable: Optional callback to persist Chat objects.
- persist_chat_thread_callable: Optional callback to persist ChatThread.
- stream_chunk_callable: Optional callback for streaming text chunks.
- chat_thread: Existing ChatThread to continue, or None for new.
- existing_chats: Existing Chat messages if loading from persistence.
- """
- self._network_data_loader = network_data_loader
- self._storage_data_loader = storage_data_loader
- self._window_property_data_loader = window_property_data_loader
- self._js_data_loader = js_data_loader
- self._interaction_data_loader = interaction_data_loader
- self._documentation_data_loader = documentation_data_loader
- self._task = task
- self._subagent_llm_model = subagent_llm_model or llm_model
- self._max_iterations = max_iterations
- self._remote_debugging_address = remote_debugging_address
-
- # Internal state
- self._orchestration_state = AgentOrchestrationState()
- self._discovery_state = RoutineDiscoveryState(phase=DiscoveryPhase.PLANNING)
- self._agent_instances: dict[str, AbstractSpecialist] = {} # agent_id -> instance
-
- # Result tracking
- self._final_routine: Routine | None = None
- self._failure_reason: str | None = None
-
- super().__init__(
- emit_message_callable=emit_message_callable,
- persist_chat_callable=persist_chat_callable,
- persist_chat_thread_callable=persist_chat_thread_callable,
- stream_chunk_callable=stream_chunk_callable,
- llm_model=llm_model,
- chat_thread=chat_thread,
- existing_chats=existing_chats,
- documentation_data_loader=documentation_data_loader,
- )
-
- ## Abstract method implementations
-
- def _get_system_prompt(self) -> str:
- """Build the system prompt scoped to the current phase."""
- phase = self._discovery_state.phase
-
- # Core identity + delegation rules (always included)
- prompt_parts = [self.PROMPT_CORE]
-
- # Inject specialist descriptions from AgentCard metadata
- specialist_lines = [
- f"- `{agent_type.value}`: {cls.AGENT_CARD.description}"
- for agent_type, cls in (
- (SpecialistAgentType.NETWORK_SPECIALIST, NetworkSpecialist),
- (SpecialistAgentType.VALUE_TRACE_RESOLVER, ValueTraceResolverSpecialist),
- (SpecialistAgentType.JS_SPECIALIST, JSSpecialist),
- (SpecialistAgentType.INTERACTION_SPECIALIST, InteractionSpecialist),
- )
- ]
- prompt_parts.append("\n\n**Available specialists:**\n" + "\n".join(specialist_lines))
-
- # Phase-specific instructions
- if phase == DiscoveryPhase.PLANNING:
- prompt_parts.append(self.PROMPT_PLANNING)
- elif phase == DiscoveryPhase.DISCOVERING:
- prompt_parts.append(self.PROMPT_DISCOVERING)
- elif phase == DiscoveryPhase.CONSTRUCTING:
- prompt_parts.append(self.PROMPT_CONSTRUCTING)
- prompt_parts.append(self.PLACEHOLDER_INSTRUCTIONS)
- prompt_parts.append(Routine.model_schema_markdown())
- elif phase == DiscoveryPhase.VALIDATING:
- prompt_parts.append(self.PROMPT_VALIDATING)
- prompt_parts.append(self.PLACEHOLDER_INSTRUCTIONS) # needed if fix_routine
- prompt_parts.append(Routine.model_schema_markdown()) # needed if fix_routine
-
- # Add data store summaries
- data_loader_info = []
- if self._network_data_loader:
- stats = self._network_data_loader.stats
- data_loader_info.append(f"Network: {stats.total_requests} transactions")
- if self._storage_data_loader:
- stats = self._storage_data_loader.stats
- data_loader_info.append(f"Storage: {stats.total_events} events")
- if self._window_property_data_loader:
- stats = self._window_property_data_loader.stats
- data_loader_info.append(f"Window: {stats.total_events} events")
- if self._js_data_loader:
- data_loader_info.append("JS files: available")
- if self._documentation_data_loader:
- summary = self._documentation_data_loader.stats.to_summary()
- data_loader_info.append(f"Documentation: {summary}")
-
- if data_loader_info:
- prompt_parts.append(f"\n\n## Data Sources\n{', '.join(data_loader_info)}")
-
- # Add current state
- status = self._orchestration_state.get_queue_status()
- prompt_parts.append(dedent(f"""\
-
- ## Current State
- - Phase: {self._discovery_state.phase.value}
- - Pending tasks: {status['pending_tasks']}
- - In-progress tasks: {status['in_progress_tasks']}
- - Completed tasks: {status['completed_tasks']}
- - Failed tasks: {status['failed_tasks']}
- """))
-
- # Add discovery state tracking info
- discovery_status = self._discovery_state.get_queue_status()
- if self._discovery_state.root_transaction or self._discovery_state.processed_transactions:
- prompt_parts.append(dedent(f"""\
-
- ## Discovery Progress
- - Root transaction: {"Set" if self._discovery_state.root_transaction else "Not set"}
- - Transaction queue: {discovery_status['pending_count']} pending, {discovery_status['processed_count']} processed
- - Resolved variables: {len(self._discovery_state.all_resolved_variables)}
- - Routine: {"Constructed" if self._discovery_state.production_routine else "Not constructed"}
- """))
-
- if self._remote_debugging_address:
- prompt_parts.append("\n- Browser: Connected (validation available)")
- else:
- prompt_parts.append("\n- Browser: Not connected (skip validation)")
-
- return "".join(prompt_parts)
-
- ## Public API
-
- def run(self) -> Routine | None:
- """
- Run the discovery to completion.
-
- Returns:
- The discovered Routine, or None if discovery failed.
- """
- # Seed the conversation with emphasis on delegation
- initial_message = (
- f"TASK: {self._task}\n\n"
- "IMPORTANT: Start by delegating to network_specialist to find the relevant endpoint. "
- "Call create_task(agent_type='network_specialist', prompt='Find the API endpoint for: ') "
- "then run_pending_tasks(). DO NOT manually browse transactions yourself."
- )
- self._add_chat(ChatRole.USER, initial_message)
-
- # Run the main loop
- for iteration in range(self._max_iterations):
-        logger.debug("RoutineDiscoveryAgentBeta iteration %d/%d, phase: %s",
- iteration + 1, self._max_iterations, self._discovery_state.phase.value)
-
- # Check for completion
- if self._discovery_state.phase == DiscoveryPhase.COMPLETE:
- return self._final_routine
-
- if self._discovery_state.phase == DiscoveryPhase.FAILED:
- logger.error("Discovery failed: %s", self._failure_reason)
- return None
-
- # Run agent loop iteration
- messages = self._build_messages_for_llm()
- try:
- response = self._call_llm(
- messages,
- self._get_system_prompt(),
- tool_choice="required",
- )
-
- if response.response_id:
- self._previous_response_id = response.response_id
-
- if response.content or response.tool_calls:
- chat = self._add_chat(
- ChatRole.ASSISTANT,
- response.content or "",
- tool_calls=response.tool_calls if response.tool_calls else None,
- llm_provider_response_id=response.response_id,
- )
- if response.content:
- self._emit_message(
- ChatResponseEmittedMessage(
- content=response.content,
- chat_id=chat.id,
- chat_thread_id=self._thread.id,
- )
- )
-
- if response.tool_calls:
- self._process_tool_calls(response.tool_calls)
- else:
- # Prompt the agent to continue if no tool calls - provide phase-specific guidance
- phase = self._discovery_state.phase
- if phase == DiscoveryPhase.PLANNING:
- guidance = (
- "Phase: PLANNING. You MUST delegate to specialists! "
- "Call create_task(agent_type='network_specialist', prompt='Find the API endpoint for: ') "
- "then run_pending_tasks(). DO NOT use list_transactions or get_transaction directly."
- )
- elif phase == DiscoveryPhase.DISCOVERING:
- task_status = self._orchestration_state.get_queue_status()
- if task_status["pending_tasks"] > 0:
- guidance = (
- f"Phase: DISCOVERING. You have {task_status['pending_tasks']} pending tasks. "
- "Call run_pending_tasks() to execute them."
- )
- elif task_status["completed_tasks"] > 0:
- guidance = (
- "Phase: DISCOVERING. Tasks completed. Review results with get_task_result(task_id), "
- "then record findings using record_identified_endpoint, record_extracted_variable. "
- "For DYNAMIC_TOKENs, delegate to value_trace_resolver - don't use scan_for_value directly."
- )
- else:
- guidance = (
- "Phase: DISCOVERING. No tasks created yet! You MUST delegate: "
- "create_task(agent_type='network_specialist', prompt='...') then run_pending_tasks(). "
- "DO NOT manually inspect transactions - let specialists do the work."
- )
- elif phase == DiscoveryPhase.CONSTRUCTING:
- if not self._discovery_state.production_routine:
- guidance = (
- "Phase: CONSTRUCTING. Call get_discovery_context to see all discovered data, "
- "then use construct_routine to build the routine."
- )
- else:
- guidance = (
- "Phase: CONSTRUCTING. Routine already constructed. "
- "Proceed to validation or mark as done."
- )
- elif phase == DiscoveryPhase.VALIDATING:
- guidance = (
-                        "Phase: VALIDATING. Review the validate_routine execution results, "
-                        "then call analyze_validation (required before done). If validation "
-                        "failed, fix the issues with construct_routine and re-validate."
- )
- else:
- guidance = f"Phase: {phase.value}. Use tools to make progress."
-
- self._add_chat(ChatRole.SYSTEM, f"[ACTION REQUIRED] {guidance}")
-
- except Exception as e:
-            logger.exception("Error in RoutineDiscoveryAgentBeta loop: %s", e)
- self._emit_message(ErrorEmittedMessage(error=str(e)))
- self._discovery_state.phase = DiscoveryPhase.FAILED
- self._failure_reason = str(e)
- return None
-
-        logger.warning("RoutineDiscoveryAgentBeta hit max iterations (%d)", self._max_iterations)
- self._discovery_state.phase = DiscoveryPhase.FAILED
- self._failure_reason = f"Max iterations ({self._max_iterations}) reached"
- return None
-
- ## Internal methods
-
- def _get_or_create_agent(self, task: Task) -> AbstractSpecialist:
- """Get existing agent instance or create new one for the task."""
- # Check if task specifies an existing agent
- if task.agent_id and task.agent_id in self._agent_instances:
- return self._agent_instances[task.agent_id]
-
- # Create new agent based on type
- agent_type = task.agent_type
- agent = self._create_specialist(agent_type)
-
- # Create SubAgent record and store instance
- subagent = SubAgent(
- type=agent_type,
- llm_model=self._subagent_llm_model.value,
- )
- self._orchestration_state.subagents[subagent.id] = subagent
- self._agent_instances[subagent.id] = agent
-
- # Update task with agent_id
- task.agent_id = subagent.id
- subagent.task_ids.append(task.id)
-
- return agent
-
- def _create_specialist(self, agent_type: SpecialistAgentType) -> AbstractSpecialist:
- """Create a specialist instance based on type."""
- if agent_type == SpecialistAgentType.JS_SPECIALIST:
- return JSSpecialist(
- emit_message_callable=self._emit_message_callable,
- llm_model=self._subagent_llm_model,
- documentation_data_loader=self._documentation_data_loader,
- network_data_loader=self._network_data_loader,
- js_data_loader=None, # NOTE: this is intentionally left None for now
- remote_debugging_address=self._remote_debugging_address,
- run_mode=RunMode.AUTONOMOUS,
- )
-
- elif agent_type == SpecialistAgentType.VALUE_TRACE_RESOLVER:
- return ValueTraceResolverSpecialist(
- emit_message_callable=self._emit_message_callable,
- documentation_data_loader=self._documentation_data_loader,
- network_data_loader=self._network_data_loader,
- storage_data_loader=self._storage_data_loader,
- window_property_data_loader=self._window_property_data_loader,
- llm_model=self._subagent_llm_model,
- run_mode=RunMode.AUTONOMOUS,
- )
-
- elif agent_type == SpecialistAgentType.NETWORK_SPECIALIST:
- if not self._network_data_loader:
- raise ValueError(
- "network_specialist requires network_data_loader, "
- "but it was not provided to RoutineDiscoveryAgentBeta"
- )
- return NetworkSpecialist(
- emit_message_callable=self._emit_message_callable,
- llm_model=self._subagent_llm_model,
- network_data_loader=self._network_data_loader,
- documentation_data_loader=self._documentation_data_loader,
- run_mode=RunMode.AUTONOMOUS,
- )
-
- elif agent_type == SpecialistAgentType.INTERACTION_SPECIALIST:
- if not self._interaction_data_loader:
- raise ValueError(
- "interaction_specialist requires interaction_data_loader, "
- "but it was not provided to RoutineDiscoveryAgentBeta"
- )
- return InteractionSpecialist(
- emit_message_callable=self._emit_message_callable,
- interaction_data_loader=self._interaction_data_loader,
- documentation_data_loader=self._documentation_data_loader,
- llm_model=self._subagent_llm_model,
- run_mode=RunMode.AUTONOMOUS,
- )
-
- else:
- raise NotImplementedError(
- f"Agent type {agent_type.value} is not yet supported. "
- f"Available types: js_specialist, network_specialist, value_trace_resolver, interaction_specialist"
- )
-
- def _execute_task(self, task: Task) -> dict[str, Any]:
- """Execute a task using the appropriate specialist."""
- task.status = TaskStatus.IN_PROGRESS
- task.started_at = datetime.now()
-
- try:
- agent = self._get_or_create_agent(task)
-
- # Calculate remaining loops
- remaining_loops = task.max_loops - task.loops_used
- if remaining_loops <= 0:
- task.status = TaskStatus.FAILED
- task.error = "No loops remaining"
- return {"success": False, "error": "No loops remaining"}
-
- # Run autonomous with config - pass output schema here (not before)
- # so it doesn't get cleared by _reset_autonomous_state()
- config = AutonomousConfig(
- min_iterations=1, # Allow immediate finalization for resumed tasks
- max_iterations=remaining_loops,
- )
-
- result = agent.run_autonomous(
- task=task.prompt,
- config=config,
- output_schema=task.output_schema,
- output_description=task.output_description,
- )
-
- # Update loops used
- task.loops_used += agent.autonomous_iteration
-
- if result is not None:
- task.status = TaskStatus.COMPLETED
- task.completed_at = datetime.now()
- task.result = result.model_dump() if isinstance(result, BaseModel) else result
- return {"success": True, "result": task.result}
- else:
- # Agent hit max iterations without finalizing
- if task.loops_used < task.max_loops:
- task.status = TaskStatus.PAUSED
- return {"success": False, "status": "paused", "loops_used": task.loops_used}
- else:
- task.status = TaskStatus.FAILED
- task.error = "Max loops reached without result"
- return {"success": False, "error": task.error}
-
- except Exception as e:
- task.status = TaskStatus.FAILED
- task.error = str(e)
- task.completed_at = datetime.now()
- logger.error("Task %s failed: %s", task.id, e)
- return {"success": False, "error": str(e)}
-
- def _validate_discovery_completeness(self) -> tuple[bool, list[str]]:
- """
- Check if discovery state is complete enough to construct routine.
-
- Returns:
- Tuple of (is_complete, list_of_blockers).
- If is_complete is False, blockers explain what's missing.
- """
- blockers = []
-
- # Check if root transaction is set
- if not self._discovery_state.root_transaction:
- blockers.append("No root transaction recorded")
-
- # Check for unresolved dynamic tokens
- unresolved_tokens = []
- for tx_id, tx_data in self._discovery_state.transaction_data.items():
- if tx_data.get("extracted_variables"):
- extracted = tx_data["extracted_variables"]
- resolved_names = {
- rv.variable.name
- for rv in tx_data.get("resolved_variables", [])
- }
- for var in extracted.variables:
- if var.requires_dynamic_resolution and var.name not in resolved_names:
- unresolved_tokens.append(f"{var.name} (in {tx_id})")
-
- if unresolved_tokens:
- blockers.append(f"Unresolved dynamic tokens: {', '.join(unresolved_tokens)}")
-
- # Check if transaction queue is not empty
- if self._discovery_state.transaction_queue:
- blockers.append(
- f"Transaction dependencies pending: {self._discovery_state.transaction_queue}"
- )
-
- is_complete = len(blockers) == 0
- return is_complete, blockers
-
- def _get_discovery_summary(self) -> str:
- """
- Get a human-readable summary of the current discovery state.
-
- Returns:
- Formatted string summarizing discovery progress.
- """
- lines = []
- lines.append("=== Discovery State Summary ===")
-
- # Root transaction
- if self._discovery_state.root_transaction:
- root = self._discovery_state.root_transaction
- lines.append(f"Root Transaction: {root.url} ({root.method.value})")
- else:
- lines.append("Root Transaction: Not set")
-
- # Transaction processing
- status = self._discovery_state.get_queue_status()
- lines.append(
- f"Transactions: {status['processed_count']} processed, "
- f"{status['pending_count']} pending"
- )
-
- # Variables
- params = [
- rv.variable for rv in self._discovery_state.all_resolved_variables
- if rv.variable.type == VariableType.PARAMETER
- ]
- tokens = [
- rv.variable for rv in self._discovery_state.all_resolved_variables
- if rv.variable.type == VariableType.DYNAMIC_TOKEN
- ]
- statics = [
- rv.variable for rv in self._discovery_state.all_resolved_variables
- if rv.variable.type == VariableType.STATIC_VALUE
- ]
-
- lines.append(f"Parameters: {len(params)} ({', '.join(p.name for p in params) if params else 'none'})")
- lines.append(f"Dynamic Tokens: {len(tokens)} ({', '.join(t.name for t in tokens) if tokens else 'none'})")
- lines.append(f"Static Values: {len(statics)}")
-
- # Routine status
- if self._discovery_state.production_routine:
- routine = self._discovery_state.production_routine
- lines.append(
- f"Routine: Constructed ({len(routine.parameters)} params, "
- f"{len(routine.operations)} operations)"
- )
- else:
- lines.append("Routine: Not constructed")
-
- # Completeness check
- is_complete, blockers = self._validate_discovery_completeness()
- if is_complete:
- lines.append("Status: Ready to construct routine")
- else:
- lines.append(f"Status: Not ready - {'; '.join(blockers)}")
-
- return "\n".join(lines)
-
- ## Tools - Task Management
-
- # Available agent types for task creation
- AVAILABLE_AGENT_TYPES = {
- SpecialistAgentType.JS_SPECIALIST,
- SpecialistAgentType.NETWORK_SPECIALIST,
- SpecialistAgentType.VALUE_TRACE_RESOLVER,
- SpecialistAgentType.INTERACTION_SPECIALIST,
- }
-
- @agent_tool(
- description="Create a new task for a specialist subagent (network_specialist, value_trace_resolver, js_specialist, interaction_specialist).",
- parameters={
- "type": "object",
- "properties": {
- "agent_type": {
- "type": "string",
- "enum": ["network_specialist", "value_trace_resolver", "js_specialist", "interaction_specialist"],
- "description": "Type of specialist agent"
- },
- "prompt": {
- "type": "string",
- "description": "Task instructions for the specialist"
- },
- "agent_id": {
- "type": "string",
- "description": "Optional ID of existing agent to reuse"
- },
- "max_loops": {
- "type": "integer",
- "default": 15,
- "description": "Maximum LLM iterations for this task"
- },
- "output_schema": {
- "type": "object",
- "description": "JSON Schema defining expected output structure"
- },
- "output_description": {
- "type": "string",
- "description": "Human-readable description of expected output"
- },
- "context": {
- "type": "object",
- "description": "Additional context data for the specialist"
- }
- },
- "required": ["agent_type", "prompt"]
- },
- availability=True,
- )
- def _create_task(
- self,
- agent_type: str,
- prompt: str,
- agent_id: str | None = None,
- max_loops: int = 15,
- output_schema: dict[str, Any] | None = None,
- output_description: str | None = None,
- context: dict[str, Any] | None = None,
- ) -> dict[str, Any]:
- """
- Create a new task for a specialist subagent.
-
- Args:
- agent_type: Type of specialist (js_specialist, network_specialist, value_trace_resolver, interaction_specialist).
- prompt: Task instructions for the specialist.
- agent_id: Optional ID of existing agent to reuse (preserves context).
- max_loops: Maximum LLM iterations for this task (default 15).
- output_schema: JSON Schema defining expected output structure.
- output_description: Human-readable description of expected output.
- context: Additional context data for the specialist.
- """
- try:
- parsed_type = SpecialistAgentType(agent_type)
- except ValueError:
- valid_types = [t.value for t in self.AVAILABLE_AGENT_TYPES]
- return {"error": f"Invalid agent_type. Must be one of: {valid_types}"}
-
- if parsed_type not in self.AVAILABLE_AGENT_TYPES:
- valid_types = [t.value for t in self.AVAILABLE_AGENT_TYPES]
- return {"error": f"Agent type '{agent_type}' not available. Use: {valid_types}"}
-
- task = Task(
- agent_type=parsed_type,
- agent_id=agent_id,
- prompt=prompt,
- max_loops=max_loops,
- output_schema=output_schema,
- output_description=output_description,
- context=context or {},
- )
-
- self._orchestration_state.add_task(task)
- self._discovery_state.phase = DiscoveryPhase.DISCOVERING
-
- result: dict[str, Any] = {
- "success": True,
- "task_id": task.id,
- "agent_type": agent_type,
- "message": "Task created. Use run_pending_tasks to execute.",
- }
- if output_schema:
- result["output_schema_set"] = True
- if output_description:
- result["output_description_set"] = True
-
- return result
-
- @agent_tool(
- description="List all tasks and their current status.",
- parameters={"type": "object", "properties": {}, "required": []},
- availability=True,
- )
- def _list_tasks(self) -> dict[str, Any]:
- """List all tasks and their current status."""
- tasks_summary = []
- for task in self._orchestration_state.tasks.values():
- tasks_summary.append({
- "id": task.id,
- "agent_type": task.agent_type,
-                "agent_type": task.agent_type.value,
- "prompt": task.prompt[:100] + "..." if len(task.prompt) > 100 else task.prompt,
- "loops_used": task.loops_used,
- "max_loops": task.max_loops,
- })
-
- return {
- "total": len(tasks_summary),
- "pending": len(self._orchestration_state.get_pending_tasks()),
- "in_progress": len(self._orchestration_state.get_in_progress_tasks()),
- "completed": len(self._orchestration_state.get_completed_tasks()),
- "failed": len(self._orchestration_state.get_failed_tasks()),
- "tasks": tasks_summary,
- }
-
- @agent_tool(
- description="Get the result of a completed task.",
- parameters={
- "type": "object",
- "properties": {
- "task_id": {
- "type": "string",
- "description": "The ID of the task to get results for"
- }
- },
- "required": ["task_id"]
- },
- availability=True,
- )
- def _get_task_result(self, task_id: str) -> dict[str, Any]:
- """
- Get the result of a completed task.
-
- Args:
- task_id: The ID of the task to get results for.
- """
- task = self._orchestration_state.tasks.get(task_id)
- if not task:
- return {"error": f"Task {task_id} not found"}
-
- return {
- "task_id": task.id,
- "status": task.status.value,
- "result": task.result,
- "error": task.error,
- "loops_used": task.loops_used,
- }
-
- @agent_tool(
- description="Execute all pending tasks and return their results.",
- parameters={"type": "object", "properties": {}, "required": []},
- availability=True,
- )
- def _run_pending_tasks(self) -> dict[str, Any]:
- """Execute all pending tasks concurrently and return their results."""
- pending = self._orchestration_state.get_pending_tasks()
- if not pending:
- return {"message": "No pending tasks", "results": []}
-
- if len(pending) == 1:
- # Single task — no threading overhead
- task = pending[0]
- result = self._execute_task(task)
-            results = [{"task_id": task.id, "agent_type": task.agent_type.value, **result}]
- else:
- # Multiple independent tasks — run in parallel
- results = []
- with ThreadPoolExecutor(max_workers=len(pending)) as executor:
- future_to_task = {
- executor.submit(self._execute_task, task): task
- for task in pending
- }
- for future in as_completed(future_to_task):
- task = future_to_task[future]
- try:
- result = future.result()
- except Exception as e:
- logger.error("Task %s raised exception: %s", task.id, e)
- result = {"success": False, "error": str(e)}
- results.append({
- "task_id": task.id,
-                        "agent_type": task.agent_type.value,
- **result,
- })
- # Preserve original task order for deterministic output
- task_order = {task.id: i for i, task in enumerate(pending)}
- results.sort(
- key=lambda r: task_order.get(r["task_id"], 0)
- )
-
- # Check if all tasks are done and update phase
- phase_message = None
- if not self._orchestration_state.get_pending_tasks() and not self._orchestration_state.get_in_progress_tasks():
- if self._orchestration_state.get_failed_tasks():
- phase_message = "Some tasks failed. Review results and decide next steps."
- else:
- # All tasks completed successfully
- # Check if we can transition to CONSTRUCTING
- can_construct = True
- construction_blockers = []
-
- # Check if root transaction is set
- if not self._discovery_state.root_transaction:
- construction_blockers.append("No root transaction recorded (use record_identified_endpoint)")
-
- # Check if any unresolved dynamic tokens exist
- unresolved_tokens = []
- for tx_id, tx_data in self._discovery_state.transaction_data.items():
- if tx_data.get("extracted_variables"):
- extracted = tx_data["extracted_variables"]
- resolved_names = {
- rv.variable.name
- for rv in tx_data.get("resolved_variables", [])
- }
- for var in extracted.variables:
- if var.requires_dynamic_resolution and var.name not in resolved_names:
- unresolved_tokens.append(var.name)
-
- if unresolved_tokens:
- construction_blockers.append(
- f"Unresolved dynamic tokens: {unresolved_tokens} "
- f"(use value_trace_resolver and record_resolved_variable)"
- )
-
- # Check if transaction queue is not empty (dependencies pending)
- if self._discovery_state.transaction_queue:
- construction_blockers.append(
- f"Transaction queue not empty: {self._discovery_state.transaction_queue} "
- f"(process dependencies first)"
- )
-
-             if construction_blockers:
- phase_message = (
- "All tasks completed, but cannot construct routine yet. Blockers: " +
- "; ".join(construction_blockers)
- )
- else:
- # Can transition to CONSTRUCTING
- self._discovery_state.phase = DiscoveryPhase.CONSTRUCTING
- phase_message = (
- "All tasks completed and discovery is complete! "
- "Use get_discovery_context to see all discovered data, "
- "then construct_routine to build the routine."
- )
-
- result = {
- "executed": len(results),
- "results": results,
- "phase": self._discovery_state.phase.value,
- }
-
- if phase_message:
- result["phase_message"] = phase_message
-
- return result
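
Taken in isolation, the fan-out pattern above (run independent tasks through a `ThreadPoolExecutor`, collect with `as_completed`, then restore submission order) can be sketched as follows; `run_all` and the toy tasks are illustrative names, not part of this codebase:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed


def run_all(tasks: dict) -> list:
    """Run independent callables concurrently; return results in submission order."""
    results = []
    with ThreadPoolExecutor(max_workers=len(tasks)) as executor:
        future_to_id = {executor.submit(fn): task_id for task_id, fn in tasks.items()}
        for future in as_completed(future_to_id):
            task_id = future_to_id[future]
            try:
                results.append({"task_id": task_id, "success": True, "value": future.result()})
            except Exception as e:  # keep the batch alive; record the failure instead
                results.append({"task_id": task_id, "success": False, "error": str(e)})
    # as_completed yields in completion order; re-sort for deterministic output
    order = {task_id: i for i, task_id in enumerate(tasks)}
    results.sort(key=lambda r: order[r["task_id"]])
    return results


out = run_all({"a": lambda: 1, "b": lambda: 1 / 0, "c": lambda: 3})
```

One failing task does not abort the batch; its error is carried in the result row, mirroring how `_run_pending_tasks` reports per-task failures.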
-
- ## Tools - Data Access
-
- @agent_tool(
- description="[PREFER network_specialist] List transaction IDs. For finding the RIGHT endpoint, delegate to network_specialist instead - it can search semantically.",
- parameters={"type": "object", "properties": {}, "required": []},
- availability=lambda self: self._network_data_loader is not None,
- )
- def _list_transactions(self) -> dict[str, Any]:
- """List all available transaction IDs from the network captures."""
- if not self._network_data_loader:
- return {"error": "No network data store available"}
-
- entries = self._network_data_loader.entries
- # Filter to likely-useful API entries (skip static assets)
- static_extensions = ('.js', '.css', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf')
- api_entries = [e for e in entries if not any(e.url.split('?')[0].endswith(ext) for ext in static_extensions)]
- tx_summaries = [
- {"id": e.request_id, "method": e.method, "url": e.url[:100]}
- for e in api_entries
- ]
- return {
- "transactions": tx_summaries,
- "count": len(entries),
- "showing": len(tx_summaries),
- "filtered_out": len(entries) - len(api_entries),
- }
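
The static-asset filter can be factored into a predicate; `is_api_url` is a hypothetical helper (this sketch additionally lowercases the path before matching, which the tool above does not):

```python
STATIC_EXTENSIONS = (".js", ".css", ".png", ".jpg", ".jpeg", ".gif",
                     ".svg", ".ico", ".woff", ".woff2", ".ttf")


def is_api_url(url: str) -> bool:
    """True when the URL path (query string ignored) does not end in a static-asset extension."""
    path = url.split("?")[0]
    return not path.lower().endswith(STATIC_EXTENSIONS)
```

`str.endswith` accepts a tuple of suffixes, which replaces the explicit `any(...)` loop.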
-
- @agent_tool(
- description="Get full details of a transaction. Use AFTER network_specialist identifies the right transaction ID.",
- parameters={
- "type": "object",
- "properties": {
- "transaction_id": {
- "type": "string",
- "description": "The ID of the transaction to retrieve"
- }
- },
- "required": ["transaction_id"]
- },
- availability=lambda self: self._network_data_loader is not None,
- )
- def _get_transaction(self, transaction_id: str) -> dict[str, Any]:
- """
- Get full details of a transaction.
-
- Args:
- transaction_id: The ID of the transaction to retrieve.
- """
- if not self._network_data_loader:
- return {"error": "No network data store available"}
-
- entry = self._network_data_loader.get_entry(transaction_id)
- if not entry:
- # Show some available IDs as hints
- available = [e.request_id for e in self._network_data_loader.entries[:10]]
- return {"error": f"Transaction {transaction_id} not found. Sample IDs: {available}"}
-
- max_body_len = 5_000
- response_body = entry.response_body
- truncated = False
- original_length = 0
- if response_body:
- original_length = len(response_body)
- if original_length > max_body_len:
- response_body = response_body[:max_body_len]
- truncated = True
-
- result: dict[str, Any] = {
- "transaction_id": transaction_id,
- "method": entry.method,
- "url": entry.url,
- "status": entry.status,
- "request_headers": entry.request_headers,
- "post_data": entry.post_data,
- "response_headers": entry.response_headers,
- "response_body": response_body,
- }
- if truncated:
- result["response_body_truncated"] = True
- result["response_body_full_length"] = original_length
- result["response_body_note"] = (
- f"Response body truncated to {max_body_len} chars "
- f"(full length: {original_length}). "
- f"Delegate to network_specialist for full body search."
- )
- return result
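
The body-truncation bookkeeping follows a common pattern; `truncate_body` is an illustrative helper, not an existing function in this module:

```python
def truncate_body(body, max_len: int = 5_000):
    """Return (possibly truncated body, metadata describing the truncation)."""
    if not body or len(body) <= max_len:
        return body, {"truncated": False}
    return body[:max_len], {"truncated": True, "full_length": len(body)}
```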
-
- @agent_tool(
- description=(
- "[PREFER value_trace_resolver SPECIALIST] Basic value search. "
- "For DYNAMIC_TOKENs, delegate to value_trace_resolver instead - it has deeper analysis capabilities."
- ),
- parameters={
- "type": "object",
- "properties": {
- "value": {
- "type": "string",
- "description": "The value to search for"
- },
- "exclude_transaction_id": {
- "type": "string",
- "description": "Transaction ID to exclude from search (usually the one containing the value)"
- }
- },
- "required": ["value"]
- },
- availability=True,
- )
- def _scan_for_value(
- self,
- value: str,
- exclude_transaction_id: str | None = None
- ) -> dict[str, Any]:
- """
- Search for a value across all data sources.
-
- Args:
- value: The value to search for.
- exclude_transaction_id: Transaction ID to exclude from search.
- """
- results: dict[str, Any] = {
- "value": value,
- "found_in": [],
- }
-
- # Search network transactions
- if self._network_data_loader:
- for entry in self._network_data_loader.entries:
- if exclude_transaction_id and entry.request_id == exclude_transaction_id:
- continue
-
- # Search response body
- if entry.response_body and value in entry.response_body:
- results["found_in"].append({
- "source_type": "transaction",
- "transaction_id": entry.request_id,
- "location": "response_body",
- "url": entry.url[:100],
- })
-
- # Search response headers
- if entry.response_headers:
- for header_name, header_value in entry.response_headers.items():
- if value in str(header_value):
- results["found_in"].append({
- "source_type": "transaction",
- "transaction_id": entry.request_id,
- "location": f"response_header:{header_name}",
- "url": entry.url[:100],
- })
-
- # Search request headers
- if entry.request_headers:
- for header_name, header_value in entry.request_headers.items():
- if value in str(header_value):
- results["found_in"].append({
- "source_type": "transaction",
- "transaction_id": entry.request_id,
- "location": f"request_header:{header_name}",
- "url": entry.url[:100],
- })
-
- # Search request body (post_data)
- if entry.post_data:
- post_data_str = entry.post_data if isinstance(entry.post_data, str) else json.dumps(entry.post_data)
- if value in post_data_str:
- results["found_in"].append({
- "source_type": "transaction",
- "transaction_id": entry.request_id,
- "location": "request_body",
- "url": entry.url[:100],
- })
-
-     # Search storage
-     if self._storage_data_loader:
-         for event in self._storage_data_loader.entries:
-             event_value = getattr(event, "value", None)
-             if event_value and value in str(event_value):
-                 results["found_in"].append({
-                     "source_type": "storage",
-                     "storage_type": getattr(event, "storage_type", "unknown"),
-                     "key": getattr(event, "key", "unknown"),
-                 })
-
-     # Search window properties
-     if self._window_property_data_loader:
-         for event in self._window_property_data_loader.entries:
-             event_value = getattr(event, "value", None)
-             if event_value and value in str(event_value):
-                 results["found_in"].append({
-                     "source_type": "window_property",
-                     "path": getattr(event, "path", "unknown"),
-                 })
-
- results["total_matches"] = len(results["found_in"])
- return results
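
The per-transaction scan logic can be sketched over a single captured entry (plain dicts stand in for the loader's entry objects; `locations_of` is a hypothetical name):

```python
import json


def locations_of(value: str, entry: dict) -> list:
    """List the locations within one captured transaction that contain `value`."""
    hits = []
    if value in (entry.get("response_body") or ""):
        hits.append("response_body")
    for side in ("request_headers", "response_headers"):
        for name, header_value in (entry.get(side) or {}).items():
            if value in str(header_value):
                hits.append(f"{side}:{name}")
    post_data = entry.get("post_data")
    if post_data is not None:
        # request bodies may be captured as raw strings or parsed JSON
        as_text = post_data if isinstance(post_data, str) else json.dumps(post_data)
        if value in as_text:
            hits.append("request_body")
    return hits


entry = {
    "response_body": '{"csrf": "abc123"}',
    "request_headers": {"x-csrf-token": "abc123"},
    "response_headers": {"content-type": "application/json"},
    "post_data": {"token": "abc123"},
}
```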
-
- ## Tools - State Population
-
- @agent_tool(
- description="Record the main transaction identified (root transaction for routine).",
- parameters={
- "type": "object",
- "properties": {
- "request_id": {
- "type": "string",
- "description": "The transaction ID (HAR entry ID)"
- },
- "url": {
- "type": "string",
- "description": "The URL of the endpoint"
- },
- "method": {
- "type": "string",
- "enum": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"],
- "description": "HTTP method"
- },
- "description": {
- "type": "string",
- "description": "What this transaction does"
- }
- },
- "required": ["request_id", "url", "method", "description"]
- },
- availability=lambda self: self._network_data_loader is not None,
- )
- def _record_identified_endpoint(
- self,
- request_id: str,
- url: str,
- method: str,
- description: str
- ) -> dict[str, Any]:
- """
- Record the main transaction identified by network_specialist.
- This becomes the root_transaction in discovery state.
-
- Args:
- request_id: The HAR entry ID from network_specialist results.
- url: The URL of the endpoint.
- method: HTTP method (GET, POST, etc).
- description: What this transaction does.
- """
- # Validate request_id exists in network data
- if not self._network_data_loader:
- return {"error": "No network data loader available"}
-
- entry = self._network_data_loader.get_entry(request_id)
- if not entry:
- available_ids = [e.request_id for e in self._network_data_loader.entries[:10]]
- return {
- "error": f"Request ID '{request_id}' not found",
- "sample_ids": available_ids
- }
-
- # Parse HTTP method
- try:
- http_method = HTTPMethod(method.upper())
- except ValueError:
- return {"error": f"Invalid HTTP method '{method}'. Use GET, POST, PUT, DELETE, etc."}
-
- # Create TransactionIdentificationResponse
- root_transaction = TransactionIdentificationResponse(
- transaction_id=request_id,
- description=description,
- url=url,
- method=http_method,
- short_explanation=f"Main endpoint for {description}"
- )
-
- # Store in discovery state
- self._discovery_state.root_transaction = root_transaction
-
- # Add to transaction queue
- added, position = self._discovery_state.add_to_queue(request_id)
-
- # Initialize transaction data
- self._discovery_state.store_transaction_data(
- transaction_id=request_id,
- request={
- "url": entry.url,
- "method": entry.method,
- "headers": entry.request_headers,
- "body": entry.post_data,
- }
- )
-
- # Transition to DISCOVERING phase
- self._discovery_state.phase = DiscoveryPhase.DISCOVERING
-
- return {
- "success": True,
- "transaction_id": request_id,
- "added_to_queue": added,
- "queue_position": position,
- "message": f"Recorded root transaction: {url}"
- }
-
- @agent_tool(
- description="Record a variable discovered from analyzing a transaction (parameter, dynamic_token, or static_value).",
- parameters={
- "type": "object",
- "properties": {
- "transaction_id": {
- "type": "string",
- "description": "The transaction this variable belongs to"
- },
- "name": {
- "type": "string",
- "description": "Variable name (e.g., 'origin_city', 'x-trace-id')"
- },
- "type": {
- "type": "string",
- "enum": ["parameter", "dynamic_token", "static_value"],
- "description": "Variable type"
- },
- "observed_value": {
- "type": "string",
- "description": "The actual value seen in the capture"
- },
- "requires_dynamic_resolution": {
- "type": "boolean",
- "description": "True if value must be resolved at runtime"
- },
- "values_to_scan_for": {
- "type": "array",
- "items": {"type": "string"},
- "description": "Optional list of values to search for"
- }
- },
- "required": ["transaction_id", "name", "type", "observed_value", "requires_dynamic_resolution"]
- },
- availability=lambda self: self._discovery_state.root_transaction is not None,
- )
- def _record_extracted_variable(
- self,
- transaction_id: str,
- name: str,
- type: str,
- observed_value: str,
- requires_dynamic_resolution: bool,
- values_to_scan_for: list[str] | None = None
- ) -> dict[str, Any]:
- """
- Record a variable discovered from analyzing a transaction.
-
- Args:
- transaction_id: The transaction this variable belongs to.
- name: Variable name (e.g., "origin_city", "x-trace-id").
- type: Variable type - "parameter", "dynamic_token", or "static_value".
- observed_value: The actual value seen in the capture.
- requires_dynamic_resolution: True if value must be resolved at runtime.
- values_to_scan_for: Optional list of values to search for (defaults to [observed_value]).
- """
- # Validate variable type
- try:
- var_type = VariableType(type)
- except ValueError:
- return {
- "error": f"Invalid variable type '{type}'. Use: parameter, dynamic_token, or static_value"
- }
-
- # Create Variable object
- variable = Variable(
- type=var_type,
- requires_dynamic_resolution=requires_dynamic_resolution,
- name=name,
- observed_value=observed_value,
- values_to_scan_for=values_to_scan_for or [observed_value]
- )
-
- # Check if transaction_data exists for this transaction
- if transaction_id not in self._discovery_state.transaction_data:
- self._discovery_state.transaction_data[transaction_id] = {
- "request": None,
- "extracted_variables": None,
- "resolved_variables": []
- }
-
- # Get or create ExtractedVariableResponse
- tx_data = self._discovery_state.transaction_data[transaction_id]
- if tx_data.get("extracted_variables") is None:
- extracted = ExtractedVariableResponse(
- transaction_id=transaction_id,
- variables=[variable]
- )
- tx_data["extracted_variables"] = extracted
- else:
- # Add to existing variables
- tx_data["extracted_variables"].variables.append(variable)
-
- return {
- "success": True,
- "transaction_id": transaction_id,
- "variable_name": name,
- "variable_type": type,
- "requires_resolution": requires_dynamic_resolution,
- "message": f"Recorded variable '{name}' for transaction {transaction_id}"
- }
-
- @agent_tool(
- description="Record how to resolve a dynamic token (storage, window_property, or transaction source). Auto-adds dependency transactions.",
- parameters={
- "type": "object",
- "properties": {
- "variable_name": {
- "type": "string",
- "description": "Name of the variable being resolved"
- },
- "transaction_id": {
- "type": "string",
- "description": "The transaction this variable belongs to"
- },
- "source_type": {
- "type": "string",
- "enum": ["storage", "window_property", "transaction"],
- "description": "Where the value comes from"
- },
- "storage_source": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "enum": ["cookie", "localStorage", "sessionStorage"]
- },
- "dot_path": {"type": "string"}
- },
- "description": "For storage source"
- },
- "window_property_source": {
- "type": "object",
- "properties": {
- "dot_path": {"type": "string"}
- },
- "description": "For window source"
- },
- "transaction_source": {
- "type": "object",
- "properties": {
- "transaction_id": {"type": "string"},
- "dot_path": {"type": "string"}
- },
- "description": "For transaction source"
- }
- },
- "required": ["variable_name", "transaction_id", "source_type"]
- },
- availability=lambda self: self._discovery_state.root_transaction is not None,
- )
- def _record_resolved_variable(
- self,
- variable_name: str,
- transaction_id: str,
- source_type: str,
- storage_source: dict[str, str] | None = None,
- window_property_source: dict[str, str] | None = None,
- transaction_source: dict[str, str] | None = None,
- ) -> dict[str, Any]:
- """
- Record how to resolve a dynamic token.
-
- Args:
- variable_name: Name of the variable being resolved.
- transaction_id: The transaction this variable belongs to.
- source_type: Where the value comes from ("storage", "window_property", "transaction").
- storage_source: For storage source - {"type": "cookie|localStorage|sessionStorage", "dot_path": "path"}.
- window_property_source: For window property source - {"dot_path": "path"}.
- transaction_source: For transaction source - {"transaction_id": "id", "dot_path": "path"}.
- """
- # Get the variable from extracted variables
- tx_data = self._discovery_state.transaction_data.get(transaction_id)
- if not tx_data or not tx_data.get("extracted_variables"):
- return {"error": f"No extracted variables found for transaction {transaction_id}"}
-
- extracted = tx_data["extracted_variables"]
- variable = None
- for var in extracted.variables:
- if var.name == variable_name:
- variable = var
- break
-
- if not variable:
- available = [v.name for v in extracted.variables]
- return {
- "error": f"Variable '{variable_name}' not found in transaction {transaction_id}",
- "available_variables": available
- }
-
- # Build the source object based on source_type
- source = None
- dependency_added = False
-
- if source_type == "storage":
- if not storage_source:
- return {"error": "storage_source required for source_type='storage'"}
-         try:
-             storage_type = SessionStorageType(storage_source["type"])
-         except (KeyError, ValueError):
-             return {"error": "storage_source must include 'type' (one of: cookie, localStorage, sessionStorage)"}
- source = SessionStorageSource(
- type=storage_type,
- dot_path=storage_source.get("dot_path", "")
- )
-
- elif source_type == "window_property":
- if not window_property_source:
- return {"error": "window_property_source required for source_type='window_property'"}
- source = WindowPropertySource(
- dot_path=window_property_source.get("dot_path", "")
- )
-
- elif source_type == "transaction":
- if not transaction_source:
- return {"error": "transaction_source required for source_type='transaction'"}
-         source_tx_id = transaction_source.get("transaction_id")
-         if not source_tx_id:
-             return {"error": "transaction_source must include 'transaction_id' (and usually a 'dot_path' into its response body)"}
-
- dot_path = transaction_source.get("dot_path", "")
-
- # Validate that dot_path resolves in the source transaction's response
- if dot_path and self._network_data_loader:
- source_entry = self._network_data_loader.get_entry(source_tx_id)
- if source_entry and source_entry.response_body:
- resolved_value = resolve_dotted_path(logger, source_entry.response_body, dot_path)
- if resolved_value is None:
- return {
- "error": (
- f"dot_path '{dot_path}' does not resolve to a value in transaction {source_tx_id}'s "
- "response body. Verify the path is correct."
- )
- }
-
- source = TransactionSource(
- transaction_id=source_tx_id,
- dot_path=dot_path,
- )
-
- # Auto-add dependency transaction to queue
- added, position = self._discovery_state.add_to_queue(source_tx_id)
- if added:
- dependency_added = True
- # Initialize transaction data for dependency if not exists
- if source_tx_id not in self._discovery_state.transaction_data:
- entry = self._network_data_loader.get_entry(source_tx_id) if self._network_data_loader else None
- if entry:
- self._discovery_state.store_transaction_data(
- transaction_id=source_tx_id,
- request={
- "url": entry.url,
- "method": entry.method,
- "headers": entry.request_headers,
- "body": entry.post_data,
- }
- )
-
- else:
- return {"error": f"Invalid source_type '{source_type}'. Use: storage, window_property, transaction"}
-
- # Create ResolvedVariableResponse
- resolved = ResolvedVariableResponse(
- variable=variable,
- source=source
- )
-
- # Store in transaction data
- if "resolved_variables" not in tx_data:
- tx_data["resolved_variables"] = []
- tx_data["resolved_variables"].append(resolved)
-
- result = {
- "success": True,
- "variable_name": variable_name,
- "source_type": source_type,
- "message": f"Recorded resolution for '{variable_name}'"
- }
-
- if dependency_added:
- result["dependency_added"] = source_tx_id
- result["message"] += f" (dependency transaction {source_tx_id} added to queue)"
-
- return result
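
`resolve_dotted_path` is imported from elsewhere in the codebase; the sketch below shows what such a resolver plausibly does over a JSON response body. The real helper also takes a `logger` argument (omitted here) and its exact semantics may differ:

```python
import json


def resolve_dotted_path(body: str, dot_path: str):
    """Walk a dot-separated path (including numeric list indices) through a JSON body."""
    try:
        node = json.loads(body)
    except json.JSONDecodeError:
        return None
    for part in dot_path.split("."):
        if isinstance(node, dict):
            node = node.get(part)
        elif isinstance(node, list) and part.isdigit():
            node = node[int(part)] if int(part) < len(node) else None
        else:
            return None
        if node is None:
            return None
    return node


body = '{"auth": {"tokens": [{"access_token": "eyJ..."}]}}'
```

Returning `None` for any unresolvable segment is what lets the tool above reject a bad `dot_path` before recording the resolution.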
-
- @agent_tool(
- description="Mark a transaction as fully processed (all variables extracted and resolved). Removes from queue.",
- parameters={
- "type": "object",
- "properties": {
- "transaction_id": {
- "type": "string",
- "description": "The transaction ID to mark as processed"
- }
- },
- "required": ["transaction_id"]
- },
- availability=lambda self: self._discovery_state.root_transaction is not None,
- )
- def _mark_transaction_processed(self, transaction_id: str) -> dict[str, Any]:
- """
- Mark a transaction as fully processed.
-
- Call this when you've extracted all variables and resolved all dynamic tokens
- for a transaction. This removes it from the queue and adds it to processed list.
-
- Args:
- transaction_id: The transaction ID to mark as processed.
- """
- # Check if transaction exists in our data
- if transaction_id not in self._discovery_state.transaction_data:
- return {"error": f"Transaction {transaction_id} not found in discovery data"}
-
- # Check for unresolved dynamic tokens
- tx_data = self._discovery_state.transaction_data[transaction_id]
- unresolved = []
- if tx_data.get("extracted_variables"):
- resolved_names = {
- rv.variable.name
- for rv in tx_data.get("resolved_variables", [])
- }
- for var in tx_data["extracted_variables"].variables:
- if var.requires_dynamic_resolution and var.name not in resolved_names:
- unresolved.append(var.name)
-
- if unresolved:
- return {
- "error": f"Cannot mark as processed - unresolved dynamic tokens: {unresolved}",
- "hint": "Use scan_for_value and record_resolved_variable for each token first"
- }
-
- # Remove from queue if present
- if transaction_id in self._discovery_state.transaction_queue:
- self._discovery_state.transaction_queue.remove(transaction_id)
-
- # Mark as processed
- self._discovery_state.mark_transaction_complete(transaction_id)
-
- # Get next transaction in queue
- queue_status = self._discovery_state.get_queue_status()
-
- return {
- "success": True,
- "transaction_id": transaction_id,
- "message": f"Transaction {transaction_id} marked as processed",
- "remaining_queue": queue_status["pending"],
- "processed_count": queue_status["processed_count"],
- }
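
The unresolved-token gate, used both here and in `_run_pending_tasks`, reduces to a membership check; sketched with plain dicts standing in for `Variable` objects:

```python
def unresolved_tokens(extracted: list, resolved_names: set) -> list:
    """Names of dynamic variables that still lack a recorded resolution."""
    return [
        var["name"]
        for var in extracted
        if var["requires_dynamic_resolution"] and var["name"] not in resolved_names
    ]


extracted = [
    {"name": "x-csrf-token", "requires_dynamic_resolution": True},
    {"name": "origin_city", "requires_dynamic_resolution": False},
    {"name": "session_id", "requires_dynamic_resolution": True},
]
```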
-
- @agent_tool()
- def _get_discovery_context(self) -> dict[str, Any]:
- """Get complete discovery context for routine construction."""
- # Build CRITICAL observed values reminder - this goes at the TOP
- observed_values_for_params: dict[str, str] = {}
- for tx_id, tx_data in self._discovery_state.transaction_data.items():
- if tx_data.get("extracted_variables"):
- for var in tx_data["extracted_variables"].variables:
- if var.type == VariableType.PARAMETER and var.observed_value:
- observed_values_for_params[var.name] = var.observed_value
-
- context: dict[str, Any] = {
- "phase": self._discovery_state.phase.value,
- "CRITICAL_OBSERVED_VALUES": {
- "message": "YOU MUST INCLUDE THESE observed_value FIELDS WHEN CONSTRUCTING ROUTINE PARAMETERS!",
- "parameters_with_observed_values": observed_values_for_params,
- },
- "root_transaction": None,
- "processed_transactions": [],
- "all_variables": {
- "parameters": [],
- "dynamic_tokens": [],
- "static_values": [],
- },
- "resolution_map": {},
- "summary": self._get_discovery_summary(),
- }
-
- # Root transaction
- if self._discovery_state.root_transaction:
- root = self._discovery_state.root_transaction
- context["root_transaction"] = {
- "transaction_id": root.transaction_id,
- "url": root.url,
- "method": root.method.value,
- "description": root.description,
- }
-
- # Process all transaction data
- for tx_id, tx_data in self._discovery_state.transaction_data.items():
- tx_summary = {
- "transaction_id": tx_id,
- "request": tx_data.get("request"),
- "variables": [],
- }
-
- if tx_data.get("extracted_variables"):
- for var in tx_data["extracted_variables"].variables:
- var_info = {
- "name": var.name,
- "type": var.type.value,
- "observed_value": var.observed_value,
- "requires_resolution": var.requires_dynamic_resolution,
- }
- tx_summary["variables"].append(var_info)
-
- # Categorize by type
- if var.type == VariableType.PARAMETER:
- context["all_variables"]["parameters"].append(var_info)
- elif var.type == VariableType.DYNAMIC_TOKEN:
- context["all_variables"]["dynamic_tokens"].append(var_info)
- else:
- context["all_variables"]["static_values"].append(var_info)
-
- # Add resolution info
- if tx_data.get("resolved_variables"):
- for resolved in tx_data["resolved_variables"]:
- source_info = {}
- if isinstance(resolved.source, SessionStorageSource):
- source_info = {
- "type": "storage",
- "storage_type": resolved.source.type.value,
- "dot_path": resolved.source.dot_path,
- }
- elif isinstance(resolved.source, WindowPropertySource):
- source_info = {
- "type": "window_property",
- "dot_path": resolved.source.dot_path,
- }
- elif isinstance(resolved.source, TransactionSource):
- source_info = {
- "type": "transaction",
- "transaction_id": resolved.source.transaction_id,
- "dot_path": resolved.source.dot_path,
- }
-
- context["resolution_map"][resolved.variable.name] = source_info
-
- context["processed_transactions"].append(tx_summary)
-
- # Completeness check
- is_complete, blockers = self._validate_discovery_completeness()
- context["is_complete"] = is_complete
- context["blockers"] = blockers
-
- return context
-
- ## Tools - Routine Construction
-
- @agent_tool(
- description="Construct a routine from discovered data. After constructing, use validate_routine to test it.",
- parameters={
- "type": "object",
- "properties": {
- "routine": {
- "type": "object",
- "description": "The routine to construct.",
- "properties": {
- "name": {"type": "string", "description": "Routine name"},
- "description": {"type": "string", "description": "What the routine does"},
- "parameters": {
- "type": "array",
- "description": "Input parameters. Each needs: name, type (string|number|boolean|date|enum), description.",
- "items": {"type": "object"},
- },
- "operations": {
- "type": "array",
- "description": (
- "Ordered operations. Each needs a 'type' field: "
- "navigate|fetch|return|sleep|click|input_text|press|"
- "wait_for_url|scroll|get_cookies|download|return_html|js_evaluate. "
- "Key schemas — navigate: {type, url}. "
- "fetch: {type, endpoint: {url, method, headers?, body?}, session_storage_key}. "
- "return: {type, session_storage_key, tables?}. "
- "Use {{paramName}} placeholders in URLs/bodies for parameters."
- ),
- "items": {"type": "object"},
- },
- },
- "required": ["name", "description", "parameters", "operations"],
- },
- },
- "required": ["routine"],
- },
- availability=lambda self: (
- self._discovery_state.root_transaction is not None and
- not self._discovery_state.transaction_queue
- ),
- )
- def _construct_routine(
- self,
- routine: dict[str, Any],
- ) -> dict[str, Any]:
- """
- Construct a routine from discovered data (no execution).
-
- After constructing, use validate_routine to test it with parameters.
-
- Args:
- routine: The routine dict with name, description, parameters, and operations.
- """
- self._discovery_state.phase = DiscoveryPhase.CONSTRUCTING
- self._discovery_state.construction_attempts += 1
-
- # Reset validation state when routine is (re)constructed
- self._discovery_state.last_validation_result = None
- self._discovery_state.validation_analyzed = False
- self._discovery_state.last_analysis = None
-
- try:
- routine_obj = Routine.model_validate(routine)
- except Exception as e:
- return {
- "error": f"Invalid routine structure: {e}",
- "message": "Failed to parse routine. Check schema in the docs and try again.",
- }
-
- # Get structure warnings (errors are already caught by model validation above)
- structure_warnings = routine_obj.get_structure_warnings()
-
- try:
- self._discovery_state.production_routine = routine_obj
-
- return {
- "success": True,
- "routine_name": routine_obj.name,
- "parameter_count": len(routine_obj.parameters),
- "operation_count": len(routine_obj.operations),
- "warnings": structure_warnings,
- "message": "Routine constructed. Now use validate_routine with test_parameters to execute and verify it works.",
- }
-
- except Exception as e:
- return {
- "error": str(e),
- "message": "Failed to construct routine. Check schema in the docs and try again.",
- }
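
`Routine.model_validate` does the real schema enforcement; a minimal pre-flight check of the same top-level shape (field names taken from the tool schema above, helper name and sample routine illustrative) might look like:

```python
def check_routine_shape(routine: dict) -> list:
    """Return human-readable problems with a routine dict; empty list if it looks valid."""
    problems = []
    for field in ("name", "description", "parameters", "operations"):
        if field not in routine:
            problems.append(f"missing required field '{field}'")
    for i, op in enumerate(routine.get("operations", [])):
        if "type" not in op:
            problems.append(f"operation {i} has no 'type' field")
    return problems


routine = {
    "name": "fetch_flights",
    "description": "Fetch flight search results",
    "parameters": [{"name": "origin", "type": "string", "description": "Origin city"}],
    "operations": [
        {"type": "navigate", "url": "https://example.com/flights"},
        {"type": "fetch",
         "endpoint": {"url": "https://example.com/api/search?from={{origin}}", "method": "GET"},
         "session_storage_key": "results"},
        {"type": "return", "session_storage_key": "results"},
    ],
}
```

Note the `{{origin}}` placeholder in the fetch URL, matching the parameter-substitution convention described in the tool schema.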
-
- @agent_tool(
- description=(
- "Execute the constructed routine with test parameters to validate it works. "
- "Only available when browser is connected."
- ),
- parameters={
- "type": "object",
- "properties": {
- "test_parameters": {
- "type": "object",
- "description": (
- "Test parameter values from observed data. "
- "Map of parameter_name -> observed_value. "
- "Example: {\"origin\": \"NYC\", \"destination\": \"BOS\"}. "
- "Get these from the extracted variables' observed_value fields."
- ),
- "additionalProperties": {"type": "string"},
- },
- },
- "required": ["test_parameters"],
- },
- availability=lambda self: (
- self._discovery_state.production_routine is not None and
- self._remote_debugging_address is not None # Require browser connection
- ),
- )
- def _validate_routine(
- self,
- test_parameters: dict[str, str],
- ) -> dict[str, Any]:
- """
- Execute the constructed routine with test parameters to validate it works.
-
- After validation, use analyze_validation to reflect on results before calling done.
-
- Args:
- test_parameters: Map of parameter names to observed values for testing.
- """
- if not self._discovery_state.production_routine:
- return {"error": "No routine constructed. Use construct_routine first."}
-
- self._discovery_state.phase = DiscoveryPhase.VALIDATING
- self._discovery_state.validation_attempts += 1
-
- # Store test_parameters in discovery state
- self._discovery_state.test_parameters = test_parameters
-
- # Reset analysis state
- self._discovery_state.validation_analyzed = False
- self._discovery_state.last_analysis = None
-
- routine_obj = self._discovery_state.production_routine
-
- # Import here to avoid circular dependency
- from bluebox.llms.tools.execute_routine_tool import execute_routine
-
- result = execute_routine(
- routine=routine_obj.model_dump(),
- parameters=test_parameters,
- remote_debugging_address=self._remote_debugging_address,
- timeout=60,
- close_tab_when_done=True,
- )
-
- # Store full result for analysis
- if result.get("success"):
- exec_result = result.get("result")
- self._discovery_state.last_validation_result = {
- "success": True,
- "exec_result": exec_result.model_dump() if exec_result else None,
- "data_returned": exec_result.data is not None if exec_result else False,
- }
-
- if exec_result and exec_result.ok and exec_result.data is not None:
- return {
- "routine_name": routine_obj.name,
- "execution_success": True,
- "data_returned": True,
- "data_preview": str(exec_result.data)[:500],
- "message": "Routine executed successfully with data. Use analyze_validation to reflect on results.",
- }
- else:
- return {
- "routine_name": routine_obj.name,
- "execution_success": True,
- "data_returned": False,
- "exec_result": exec_result.model_dump() if exec_result else None,
- "message": (
- "Routine executed but 'data' field is missing or empty. "
- "Use analyze_validation to decide next steps."
- ),
- }
- else:
- self._discovery_state.last_validation_result = {
- "success": False,
- "error": result.get("error", "Unknown error"),
- }
- return {
- "routine_name": routine_obj.name,
- "execution_success": False,
- "error": result.get("error", "Unknown error"),
- "message": "Routine execution failed. Use analyze_validation to decide next steps.",
- }
-
- @agent_tool(
- description="Analyze validation results and decide next steps. REQUIRED before calling done().",
- parameters={
- "type": "object",
- "properties": {
- "analysis": {
- "type": "string",
- "description": "Your analysis of what worked and what failed in the validation.",
- },
- "data_matches_task": {
- "type": "boolean",
- "description": "Does the returned data accomplish the original task the user requested?",
- },
- "next_action": {
- "type": "string",
- "enum": ["done", "fix_routine", "retry_validation"],
- "description": (
- "What to do next: 'done' if successful, 'fix_routine' to modify routine, "
- "'retry_validation' to re-run."
- ),
- },
- },
- "required": ["analysis", "data_matches_task", "next_action"],
- },
- availability=lambda self: (
- self._discovery_state.last_validation_result is not None and
- not self._discovery_state.validation_analyzed
- ),
- )
- def _analyze_validation(
- self,
- analysis: str,
- data_matches_task: bool,
- next_action: str,
- ) -> dict[str, Any]:
- """
- Analyze validation results and decide next steps. Required before calling done().
-
- Args:
- analysis: Your analysis of what worked and what failed.
- data_matches_task: Does the returned data accomplish the original task?
- next_action: What to do next - 'done', 'fix_routine', or 'retry_validation'.
- """
- if self._discovery_state.last_validation_result is None:
- return {"error": "No validation result to analyze. Use validate_routine first."}
-
- # Validate next_action
- valid_actions = ["done", "fix_routine", "retry_validation"]
- if next_action not in valid_actions:
- return {"error": f"Invalid next_action. Must be one of: {valid_actions}"}
-
- # Store the analysis
- self._discovery_state.last_analysis = {
- "analysis": analysis,
- "data_matches_task": data_matches_task,
- "next_action": next_action,
- }
- self._discovery_state.validation_analyzed = True
-
- # Check for inconsistency: can't say "done" if data doesn't match task
- if next_action == "done" and not data_matches_task:
- return {
- "error": "Inconsistent analysis: next_action is 'done' but data_matches_task is False.",
- "message": "If data doesn't match the task, you must fix the routine first.",
- "hint": "Set next_action to 'fix_routine' and update the routine to return correct data.",
- }
-
- # Check validation result
- validation_result = self._discovery_state.last_validation_result
- validation_failed = not validation_result.get("success", False) and not validation_result.get("skipped", False)
-
- if next_action == "done" and validation_failed:
- return {
- "error": "Cannot mark as done when validation failed.",
- "message": "Fix the routine and re-validate before completing.",
- }
-
- # Return guidance based on next_action
- if next_action == "done":
- return {
- "success": True,
- "message": "Analysis recorded. You may now call done() to complete discovery.",
- "analysis_summary": {
- "analysis": analysis,
- "data_matches_task": data_matches_task,
- },
- }
- elif next_action == "fix_routine":
- return {
- "success": True,
- "message": "Analysis recorded. Use construct_routine to fix the routine, then validate_routine again.",
- "analysis_summary": {
- "analysis": analysis,
- "data_matches_task": data_matches_task,
- },
- }
- else: # retry_validation
- # Reset for retry
- self._discovery_state.validation_analyzed = False
- self._discovery_state.last_analysis = None
- return {
- "success": True,
- "message": "Analysis recorded. Use validate_routine to retry validation.",
- "analysis_summary": {
- "analysis": analysis,
- "data_matches_task": data_matches_task,
- },
- }
-
- ## Tools - Completion
-
- def _can_complete(self) -> bool:
- """Check if discovery can be marked complete."""
- # Must have a routine
- if not self._discovery_state.production_routine:
- return False
-
- # If no browser connected, can complete without validation
- # (we can't execute routines without a browser)
- if not self._remote_debugging_address:
- return True
-
- # With browser: must have validated and analyzed successfully
- if not self._discovery_state.validation_analyzed:
- return False
-
- analysis = self._discovery_state.last_analysis
- if not analysis:
- return False
-
- return analysis.get("data_matches_task", False)
-
- @agent_tool(
- availability=lambda self: self._can_complete(),
- )
- def _done(self) -> dict[str, Any]:
- """Mark discovery as complete. Available after construct_routine (no browser) or successful analyze_validation (with browser)."""
- if not self._discovery_state.production_routine:
- return {"error": "No routine constructed. Use construct_routine first."}
-
- # If browser connected, require successful validation analysis
- if self._remote_debugging_address:
- if not self._discovery_state.validation_analyzed:
- return {"error": "Validation not analyzed. Use validate_routine then analyze_validation first."}
-
- analysis = self._discovery_state.last_analysis
- if not analysis:
- return {"error": "No analysis found. Use analyze_validation first."}
-
- if not analysis.get("data_matches_task", False):
- return {
- "error": "Cannot complete when data doesn't match task.",
- "message": "Fix the routine with construct_routine, then validate_routine and analyze_validation again.",
- }
-
- self._discovery_state.phase = DiscoveryPhase.COMPLETE
- self._final_routine = self._discovery_state.production_routine
-
- # Note if routine was not validated
- message = "Discovery completed"
- if not self._remote_debugging_address:
- message += " (routine not validated - no browser connected)"
-
- return {
- "success": True,
- "message": message,
- "routine_name": self._final_routine.name,
- }
-
- @agent_tool(
- availability=lambda self: (
- self._discovery_state.root_transaction is None
- or self._discovery_state.construction_attempts >= 5
- ),
- )
- def _fail(self, reason: str) -> dict[str, Any]:
- """
- Mark discovery as failed.
-
- Args:
- reason: Why discovery could not be completed.
- """
- self._discovery_state.phase = DiscoveryPhase.FAILED
- self._failure_reason = reason
- return {
- "success": False,
- "message": "Discovery marked as failed",
- "reason": reason,
- }
diff --git a/bluebox/agents/routine_inspector.py b/bluebox/agents/routine_inspector.py
new file mode 100644
index 00000000..3f0aa9ee
--- /dev/null
+++ b/bluebox/agents/routine_inspector.py
@@ -0,0 +1,314 @@
+"""
+bluebox/agents/routine_inspector.py
+
+RoutineInspector — independent quality gate for constructed routines.
+
+The inspector receives ALL context in the task prompt and returns a structured
+RoutineInspectionResult. It has no knowledge of the discovery process — it
+judges the OUTPUT, not the PROCESS. When equipped with documentation tools,
+it can search common-issues docs to provide specific remediation advice.
+
+Think of it as a peer reviewer: reads the routine cold, checks if the claims
+hold up, and decides: publish, revise, or reject.
+"""
+
+from __future__ import annotations
+
+from textwrap import dedent
+from typing import Callable, TYPE_CHECKING
+
+from bluebox.agents.abstract_agent import AbstractAgent, AgentCard
+from bluebox.workspace import AgentWorkspace
+from bluebox.data_models.llms.interaction import (
+ Chat,
+ ChatThread,
+ EmittedMessage,
+)
+from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel
+from bluebox.data_models.orchestration.inspection import RoutineInspectionResult
+from bluebox.data_models.orchestration.result import SpecialistResultWrapper
+from bluebox.utils.logger import get_logger
+
+if TYPE_CHECKING:
+ from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader
+
+logger = get_logger(name=__name__)
+
+
+class RoutineInspector(AbstractAgent):
+ """
+ Independent quality gate for constructed routines.
+
+ Receives routine + execution result + exploration context as the task prompt,
+ scores on 6 dimensions, and returns a RoutineInspectionResult via
+ finalize_with_output. Has optional access to documentation tools to provide
+ specific, actionable remediation advice in recommendations.
+ """
+
+ AGENT_CARD = AgentCard(
+ description=(
+ "Independent quality gate that judges constructed routines on 6 dimensions: "
+ "task completion, data quality, parameter coverage, routine robustness, "
+ "structural correctness, and documentation quality. Can reference routine "
+ "documentation to provide actionable fix recommendations."
+ ),
+ )
+ SYSTEM_PROMPT: str = dedent("""\
+ You are a routine quality inspector. You judge routines objectively.
+
+ You have NO knowledge of how the routine was built. You only see:
+ - The user's task
+ - The routine JSON
+ - The execution result (if available)
+ - Exploration summaries (what the site looks like)
+
+ Your job: score the routine and decide if it ships.
+ """)
+
+ AUTONOMOUS_SYSTEM_PROMPT: str = dedent("""\
+ You are an independent routine quality inspector. You receive a routine
+ and must judge whether it correctly accomplishes its own stated purpose
+ (name + description). Do NOT judge it against any broader project goal —
+ only against what the routine itself claims to do.
+
+ ## CRITICAL: Judge ACTUAL Results, Not Hypotheticals
+
+ You score based on WHAT ACTUALLY HAPPENED, not what "would work if...".
+ If the execution returned a 401, the routine FAILED. Period. You do not
+ get to say "it would return rich data with valid credentials" — that is
+ speculation, not inspection. A routine that doesn't work doesn't ship.
+
+ **Automatic failure signals (ANY of these → task_completion ≤ 2, data_quality ≤ 2):**
+ - HTTP 4xx or 5xx status codes in ANY operation response
+ - Unresolved placeholders (e.g. "Could not resolve placeholder: ...")
+ - Error messages in the response body (e.g. "Access denied", "Unauthorized",
+ "Invalid", "Forbidden", "Not found")
+ - Test parameters containing obvious placeholder values like "REPLACE_WITH_...",
+ "YOUR_..._HERE", "TODO", "FIXME" — this means the routine can't be tested
+ - Empty or null response data when the routine promises to return something
+ - The execution_result.data containing an error object instead of real data
+
+ **You are a quality gate, not a cheerleader.** Your job is to BLOCK bad routines
+ from shipping. If you let a broken routine through, it pollutes the database
+ and wastes other agents' time. When in doubt, FAIL it.
+
+ ## CRITICAL: Spec Description Downgrade Detection
+
+ When the inspection prompt includes a "Spec vs Routine Description Comparison"
+ section, you MUST check whether the routine's own description has been watered
+ down from the original spec. If the spec promises rich, detailed data but the
+ routine description claims to return only minimal fields, this is a BLOCKING issue:
+
+ - Add blocking issue: "Routine description is significantly weaker than the spec
+ description. Spec promises: '<spec description>'. Routine claims: '<routine description>'.
+ The routine must deliver on the original spec or the spec should be updated."
+ - Cap task_completion at 4 — the routine may work for what it claims, but it
+ does NOT fulfill the originally planned capability.
+ - Cap data_quality at 4 — returning 2 fields when 15 were promised is not
+ quality data.
+
+ ## Scoring Rubric (6 dimensions, 0-10 each)
+
+ 1. **Task Completion** — Does the returned data ACTUALLY accomplish what
+ the routine's name and description promise? Check the REAL execution result.
+ - Did the routine return the data it claims to return? Not "could it" — DID IT?
+ - A flight search that returned a 401 error did NOT return flights → score 0-2
+ - A standings routine that returned an HTML error page did NOT return standings → score 0-2
+ - ONLY score above 5 if the execution result contains ACTUAL meaningful data
+ that matches what the routine promises
+
+ 2. **Data Quality** — Is the ACTUAL response complete and meaningful?
+ - Check the REAL response data, not what you imagine it could contain
+ - A 401/403/500 response has ZERO data quality regardless of how "correct"
+ the request structure looks → score 0-2
+ - An error message body is not "data" → score 0
+ - Truncated, empty, or missing data → score 0-3
+ - ONLY score above 5 if the response contains REAL, COMPLETE, MEANINGFUL data
+
+ 3. **Parameter Coverage** — Are the right values parameterized? Any hardcoded
+ values that should be params (dates, search terms, IDs)? Any unnecessary
+ params that could be hardcoded?
+
+ 4. **Routine Robustness** — Would this work in a fresh session? Are dynamic
+ tokens properly resolved via placeholders (not hardcoded expired values)?
+ Does it handle auth correctly (navigate first to establish cookies/tokens
+ before making API calls)?
+ - If any placeholder failed to resolve → score ≤ 4
+ - If auth tokens are not properly obtained → score ≤ 3
+
+ 5. **Structural Correctness** — Navigate before fetch? Dependencies before
+ dependents? Consistent session_storage_key usage (write before read)?
+ Valid placeholder types? Operations in correct order?
+
+ 6. **Documentation Quality** — CRITICAL: These routines will be vectorized and
+ stored in databases for other agents to discover via semantic search.
+ Score strictly:
+
+ **Routine name** (0-3 points):
+ - Must be snake_case with verb_site_noun pattern, ≥3 segments
+ - MUST include the site/service name so the name makes sense in isolation
+ to an agent that has never seen this routine before
+ - GOOD: get_premierleague_standings, search_amtrak_trains, fetch_espn_scores
+ - BAD: get_standings (from where?), get_content_item (what content? what site?),
+ fetch_data (completely generic), search_matches (which sport? which site?)
+ - 0 = missing/generic/no site context, 1 = has site but vague noun,
+ 2 = decent with site + noun, 3 = precise verb_site_noun with clear specificity
+
+ **Routine description** (0-4 points):
+ - Must be ≥8 words
+ - Must explain: (a) what it does, (b) what inputs it takes, (c) what data it returns
+ - Example of 4/4: "Fetches Premier League standings for a given competition ID
+ and season ID, returning team names, positions, points, and goal difference."
+ - 0 = missing/useless, 1 = says what it does only, 2 = adds inputs, 3 = adds outputs, 4 = complete
+
+ **Parameter descriptions** (0-3 points):
+ - Every parameter must have a description of ≥3 words
+ - Should explain what the value represents AND its expected format/range
+ - CRITICAL for non-obvious parameters (opaque IDs, slugs, codes, UUIDs):
+ The description MUST explain WHERE to get the value. If the user can't
+ google it, the description must say how to obtain it — e.g. which other
+ routine or API endpoint provides valid values.
+ - Example of 3/3: "Internal competition ID. Obtain from the get_competitions
+ routine or the /competitions endpoint. Example: 1 = Premier League."
+ - Example of 2/3: "The unique competition identifier (e.g. 1 for Premier League)"
+ (good but doesn't say where to get other valid IDs)
+ - Example of 0/3: "ID" or "the season"
+ - 0 = missing descriptions, 1 = all present but terse, 2 = mostly good, 3 = all
+ excellent with sourcing info for non-obvious params
+
+ A score ≤4 in documentation_quality is a BLOCKING issue — the routine cannot
+ ship with poor metadata because it will be invisible to other agents.
+
+ ## Verdict Rules
+
+ - overall_pass = True if: no blocking_issues AND overall_score >= 60
+ - overall_score = round(sum of all 6 dimension scores / 60 × 100) (max 100)
+ - documentation_quality ≤ 4 → add blocking issue: "Documentation quality too low
+ for vectorized storage — fix routine name, description, or parameter descriptions"
+ - ANY HTTP 4xx/5xx in execution → add blocking issue describing the failure
+ - ANY unresolved placeholder → add blocking issue describing which placeholder failed
+ - Be STRICT on all dimensions. A broken routine is WORSE than no routine — it
+ wastes database space and misleads other agents. Only pass routines that
+ ACTUALLY WORK with REAL DATA in the execution result.
+
+ ## Documentation-Backed Recommendations
+
+ When you have access to documentation tools (search_files, read_file),
+ use them to provide SPECIFIC, actionable remediation advice in your
+ recommendations. Don't just say "fix the auth" — search for the relevant
+ doc and cite the exact fix pattern.
+
+ Common patterns to search for:
+ - "TypeError: Failed to fetch" → search_files(scope="docs", query="cors-failed-to-fetch", mode="exact") →
+ the fix is adding a navigate operation to the allowed origin
+ - 401/403 errors → search_files(scope="docs", query="unauthenticated", mode="exact") → the fix is adding
+ auth token fetch + js_evaluate extraction before data fetches
+ - Placeholder issues → search_files(scope="docs", query="placeholder-not-resolved", mode="exact") →
+ check placeholder syntax and resolution types
+ - HTML instead of JSON → search_files(scope="docs", query="fetch-returns-html", mode="exact") → wrong URL
+ or CORS redirect
+
+ Your recommendations should include: (1) what's wrong, (2) the specific
+ fix from documentation with example operations if applicable.
+
+ IMPORTANT: Only search docs when you identify a blocking issue that has
+ a known fix pattern. Do NOT search docs for every inspection — only when
+ you can provide actionable remediation. Keep doc searches to 1-2 max per
+ inspection to stay within iteration limits.
+
+ ## Process
+
+ 1. Read the routine name and description — this is what you're scoring against
+ 2. Read the routine JSON — understand each operation's purpose
+ 3. Read the execution result — **DID IT ACTUALLY WORK?** Check EVERY operation's
+ HTTP status code. Check for unresolved placeholders. Check for error messages.
+ This is the MOST IMPORTANT step. If the execution failed, the routine fails.
+ 4. Cross-reference with exploration summaries — does the data match?
+ 5. Score each dimension with specific reasoning based on ACTUAL results
+ 6. List blocking issues (MUST fix) and recommendations (SHOULD fix)
+ - If docs are available and you identified a fixable issue, search the
+ common-issues docs to include a specific fix in recommendations
+ 7. Write a 2-3 sentence summary
+ 8. Call finalize_with_output with the complete inspection result
+ """)
+
+ # -----------------------------------------------------------------------
+ # Constructor
+ # -----------------------------------------------------------------------
+
+ def __init__(
+ self,
+ emit_message_callable: Callable[[EmittedMessage], None],
+ persist_chat_callable: Callable[[Chat], Chat] | None = None,
+ persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None,
+ stream_chunk_callable: Callable[[str], None] | None = None,
+ llm_model: LLMModel = OpenAIModel.GPT_5_1,
+ chat_thread: ChatThread | None = None,
+ existing_chats: list[Chat] | None = None,
+ documentation_data_loader: DocumentationDataLoader | None = None,
+ workspace: AgentWorkspace | None = None,
+ ) -> None:
+ super().__init__(
+ emit_message_callable=emit_message_callable,
+ workspace=workspace,
+ persist_chat_callable=persist_chat_callable,
+ persist_chat_thread_callable=persist_chat_thread_callable,
+ stream_chunk_callable=stream_chunk_callable,
+ llm_model=llm_model,
+ chat_thread=chat_thread,
+ existing_chats=existing_chats,
+ documentation_data_loader=documentation_data_loader,
+ allow_code_execution=True,
+ )
+ logger.debug("RoutineInspector initialized")
+
+ # -----------------------------------------------------------------------
+ # Abstract method implementations
+ # -----------------------------------------------------------------------
+
+ def _get_system_prompt(self) -> str:
+ return self.SYSTEM_PROMPT + self._generate_code_execution_prompt()
+
+ def _get_autonomous_system_prompt(self) -> str:
+ return (
+ self.AUTONOMOUS_SYSTEM_PROMPT
+ + self._get_output_schema_prompt_section()
+ + self._generate_code_execution_prompt()
+ + self._get_documentation_prompt_section()
+ + self._get_urgency_notice()
+ )
+
+ def _get_autonomous_initial_message(self, task: str) -> str:
+ return (
+ f"INSPECTION REQUEST:\n\n{task}\n\n"
+ "Score this routine on all 6 dimensions (including documentation_quality), "
+ "identify blocking issues vs. recommendations, and call finalize_with_output "
+ "with the complete RoutineInspectionResult.\n\n"
+ "CRITICAL REMINDERS:\n"
+ "1. CHECK THE EXECUTION RESULT FIRST. If ANY operation returned HTTP 4xx/5xx, "
+ "the routine FAILED. Score task_completion and data_quality ≤ 2. Do NOT "
+ "speculate about what 'would work' — judge what ACTUALLY happened.\n"
+ "2. Check for unresolved placeholders in warnings — these are automatic failures.\n"
+ "3. Check test_parameters for placeholder values like 'REPLACE_WITH_...' — "
+ "if the routine wasn't tested with real inputs, it cannot pass.\n"
+ "4. Documentation quality: score name, description, and parameter descriptions "
+ "strictly. documentation_quality ≤ 4 is a blocking issue."
+ )
+
+ def _get_autonomous_result(self) -> SpecialistResultWrapper | None:
+ """
+ Return autonomous result with normalized/clamped inspection scores.
+ """
+ result = super()._get_autonomous_result()
+ if not isinstance(result, SpecialistResultWrapper):
+ return result
+ if not result.success or not isinstance(result.output, dict):
+ return result
+
+ try:
+ normalized = RoutineInspectionResult.model_validate(result.output)
+ result.output = normalized.model_dump(mode="json")
+ except Exception:
+ # Keep raw output if normalization fails.
+ pass
+ return result
diff --git a/bluebox/agents/specialists/__init__.py b/bluebox/agents/specialists/__init__.py
index 0871eb17..7b7debec 100644
--- a/bluebox/agents/specialists/__init__.py
+++ b/bluebox/agents/specialists/__init__.py
@@ -1,31 +1,26 @@
"""
bluebox/agents/specialists/__init__.py
-NOTE: This file is necessary because it triggers AbstractSpecialist.__init_subclass__
-for all specialist classes, populating AbstractSpecialist._subclasses list.
-This enables using AbstractSpecialist.get_all_subclasses() to discover specialists.
+NOTE: This file imports specialist classes so AbstractAgent subclass registration
+runs for each concrete specialist at import time.
"""
-from bluebox.agents.specialists.abstract_specialist import (
- AbstractSpecialist,
- AutonomousConfig,
- RunMode,
-)
-from bluebox.agents.abstract_agent import agent_tool
+from bluebox.agents.abstract_agent import AgentExecutionMode, AutonomousRunConfig, agent_tool
-# Import all specialist classes to trigger AbstractSpecialist.__init_subclass__
+# Import all specialist classes so AbstractAgent.__init_subclass__ registers them
+from bluebox.agents.specialists.dom_specialist import DOMSpecialist
from bluebox.agents.specialists.interaction_specialist import InteractionSpecialist
from bluebox.agents.specialists.js_specialist import JSSpecialist
from bluebox.agents.specialists.network_specialist import NetworkSpecialist
from bluebox.agents.specialists.value_trace_resolver_specialist import ValueTraceResolverSpecialist
__all__ = [
- # Base class and utilities
- "AbstractSpecialist",
- "AutonomousConfig",
- "RunMode",
+ # Utilities
+ "AutonomousRunConfig",
+ "AgentExecutionMode",
"agent_tool",
# Concrete specialists
+ "DOMSpecialist",
"InteractionSpecialist",
"JSSpecialist",
"NetworkSpecialist",
diff --git a/bluebox/agents/specialists/abstract_specialist.py b/bluebox/agents/specialists/abstract_specialist.py
deleted file mode 100644
index cacca874..00000000
--- a/bluebox/agents/specialists/abstract_specialist.py
+++ /dev/null
@@ -1,606 +0,0 @@
-"""
-bluebox/agents/specialists/abstract_specialist.py
-
-Abstract base class for specialist agents.
-
-Specialists are domain-expert agents that an orchestrator deploys for specific tasks.
-Each specialist owns:
- - A system prompt (conversational + autonomous variants)
- - A set of LLM tools and their execution logic
- - Finalize tools for autonomous mode (registered after min_iterations)
-
-This class extends AbstractAgent to add:
- - Autonomous mode with iteration tracking and finalize gating
- - Conversational mode for interactive chat
-
-Tools are defined declaratively via the @agent_tool decorator.
-"""
-
-from __future__ import annotations
-
-import json
-from abc import abstractmethod
-from enum import StrEnum
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, NamedTuple
-
-import jsonschema
-from pydantic import BaseModel
-
-from bluebox.agents.abstract_agent import AbstractAgent, agent_tool
-from bluebox.data_models.orchestration.result import SpecialistResultWrapper
-from bluebox.utils.llm_utils import token_optimized
-from bluebox.data_models.llms.interaction import (
- Chat,
- ChatRole,
- ChatThread,
- EmittedMessage,
- ChatResponseEmittedMessage,
- ErrorEmittedMessage,
-)
-from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel
-from bluebox.utils.logger import get_logger
-
-if TYPE_CHECKING:
- from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader
-
-logger = get_logger(name=__name__)
-
-
-
-class RunMode(StrEnum):
- """How the specialist is being run."""
- CONVERSATIONAL = "conversational" # interactive chat with a user
- AUTONOMOUS = "autonomous" # autonomous loop (exploration + finalization)
-
-
-class AutonomousConfig(NamedTuple):
- """
- Configuration for autonomous specialist runs. Helps manage their "lifecycles."
- """
- min_iterations: int = 3 # Minimum iterations before finalize tools become available
- max_iterations: int = 10 # Maximum iterations before loop exits (returns None if not finalized)
-
-
-class AbstractSpecialist(AbstractAgent):
- """
- Abstract base class for specialist agents.
-
- Subclasses implement domain-specific logic by overriding:
- - _get_system_prompt()
- - _get_autonomous_system_prompt()
- - _get_autonomous_initial_message()
- - _check_autonomous_completion() — inspect tool results for finalize signals
-
- Tools are defined declaratively via the @agent_tool decorator on handler
- methods. Each tool's ``availability`` controls when it is registered: True
- (always), or a callable evaluated before each LLM call.
-
- This class extends AbstractAgent with:
- - Autonomous mode with iteration tracking and finalize gating
- - Conversational mode for interactive chat
- """
-
- ## Class-level tracking of all specialist subclasses
- _subclasses: ClassVar[list[type[AbstractSpecialist]]] = []
-
- ## Magic methods
-
- def __init__(
- self,
- emit_message_callable: Callable[[EmittedMessage], None],
- persist_chat_callable: Callable[[Chat], Chat] | None = None,
- persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None,
- stream_chunk_callable: Callable[[str], None] | None = None,
- llm_model: LLMModel = OpenAIModel.GPT_5_2,
- run_mode: RunMode = RunMode.CONVERSATIONAL,
- chat_thread: ChatThread | None = None,
- existing_chats: list[Chat] | None = None,
- documentation_data_loader: DocumentationDataLoader | None = None,
- ) -> None:
- """
- Initialize the specialist.
-
- Args:
- emit_message_callable: Callback to emit messages to the host.
- persist_chat_callable: Optional callback to persist Chat objects.
- persist_chat_thread_callable: Optional callback to persist ChatThread.
- stream_chunk_callable: Optional callback for streaming text chunks.
- llm_model: The LLM model to use.
- run_mode: How the specialist will be run (conversational or autonomous).
- chat_thread: Existing ChatThread to continue, or None for new.
- existing_chats: Existing Chat messages if loading from persistence.
- documentation_data_loader: Optional DocumentationDataLoader for docs/code search tools.
- """
- # lifecycle state (must be set before parent __init__, which calls _sync_tools)
- self.run_mode: RunMode = run_mode
- self._autonomous_iteration: int = 0
- self._autonomous_config: AutonomousConfig = AutonomousConfig()
-
- # orchestrator-defined output schema (set via set_output_schema())
- self._task_output_schema: dict[str, Any] | None = None
- self._task_output_description: str | None = None
- self._notes: list[str] = []
- self._wrapped_result: SpecialistResultWrapper | None = None
-
- # call parent init
- super().__init__(
- emit_message_callable=emit_message_callable,
- persist_chat_callable=persist_chat_callable,
- persist_chat_thread_callable=persist_chat_thread_callable,
- stream_chunk_callable=stream_chunk_callable,
- llm_model=llm_model,
- chat_thread=chat_thread,
- existing_chats=existing_chats,
- documentation_data_loader=documentation_data_loader,
- )
-
- def __init_subclass__(cls: type[AbstractSpecialist], **kwargs: NamedTuple) -> None:
- """Register subclass when it's defined."""
- super().__init_subclass__(**kwargs)
- # Only register concrete specialists (not intermediate ABCs)
- if not cls.__name__.startswith("Abstract"):
- cls._subclasses.append(cls)
-
- ## Class methods
-
- @classmethod
- def get_all_subclasses(cls) -> list[type[AbstractSpecialist]]:
- """Return a copy of all registered specialist subclasses."""
- return cls._subclasses.copy()
-
- @classmethod
- def get_by_type(cls, agent_type: str) -> type[AbstractSpecialist] | None:
- """
- Look up a specialist class by name.
-
- Args:
- agent_type: The class name (e.g., "NetworkSpecialist", "JSSpecialist").
-
- Returns:
- The specialist class, or None if not found.
- """
- for subclass in cls._subclasses:
- if subclass.__name__ == agent_type:
- return subclass
- return None
-
- @classmethod
- def get_all_agent_types(cls) -> list[str]:
- """Return all registered specialist class names."""
- return [subclass.__name__ for subclass in cls._subclasses]
-
- ## Additional abstract methods for autonomous mode
-
- @abstractmethod
- def _get_autonomous_system_prompt(self) -> str:
- """
- Return the system prompt for autonomous mode.
-
- Called every iteration, so it can include dynamic context
- (e.g., iteration count, urgency notices).
- """
-
- @abstractmethod
- def _get_autonomous_initial_message(self, task: str) -> str:
- """
- Build the initial USER message for autonomous mode.
-
- Args:
- task: The user's task description.
-
- Returns:
- Message string to seed the autonomous conversation.
- """
-
- def _check_autonomous_completion(self, tool_name: str) -> bool:
- """
- Check whether a tool call signals autonomous completion.
-
- Called after each tool execution in the autonomous loop.
- Return True to stop the loop (e.g., finalize_result was called
- and self._autonomous_result is now set).
-
- Default implementation checks for the generic finalize tools:
- - finalize_with_output, finalize_with_failure (with output schema)
- - finalize_result, finalize_failure (without output schema)
-
- Subclasses should override this and call super() to also check
- for their own specialist-specific finalize tools.
-
- Args:
- tool_name: Name of the tool that was just executed.
-
- Returns:
- True if the autonomous loop should stop.
- """
- # Check for generic finalize tools (both with-schema and without-schema variants)
- finalize_tools = (
- "finalize_with_output",
- "finalize_with_failure",
- "finalize_result",
- "finalize_failure",
- )
- if tool_name in finalize_tools:
- return self._wrapped_result is not None
- return False
-
- def _get_autonomous_result(self) -> BaseModel | None:
- """
- Return the autonomous mode result after the loop completes.
-
- Default implementation returns the wrapped result if set via the
- generic finalize tools. Subclasses should override this and check
- for _wrapped_result first, then fall back to their own result types.
-
- Returns:
- A Pydantic model with the specialist's result,
- or None if max iterations were reached without finalization.
- """
- return self._wrapped_result
-
- ## Properties
-
- @property
- def autonomous_iteration(self) -> int:
- """Return the current/final autonomous iteration count."""
- return self._autonomous_iteration
-
- @property
- def can_finalize(self) -> bool:
- """
- Whether "finalize tools" should be available (autonomous mode, past min_iterations).
-
- Returns:
- True if the specialist is in autonomous mode and has exceeded the min_iterations threshold, False otherwise.
- """
- return (
- self.run_mode == RunMode.AUTONOMOUS
- and self._autonomous_iteration >= self._autonomous_config.min_iterations
- )
-
- @property
- def has_output_schema(self) -> bool:
- """Whether an output schema has been set by the orchestrator."""
- return self._task_output_schema is not None
-
- ## Output Schema Methods
-
- def set_output_schema(
- self,
- schema: dict[str, Any],
- description: str | None = None,
- ) -> None:
- """
- Set the expected output schema for this task.
-
- Called by the orchestrator before running the specialist to define
- what structure the specialist should return.
-
- Args:
- schema: JSON Schema defining the expected output structure.
- description: Human-readable description of what to return.
- """
- self._task_output_schema = schema
- self._task_output_description = description
-
- def _get_output_schema_prompt_section(self) -> str:
- """
- Get the output schema section to include in autonomous system prompt.
-
- Subclasses should call this and include it in their _get_autonomous_system_prompt().
-
- Returns:
- Formatted prompt section describing expected output, or empty string if no schema set.
- """
- if not self._task_output_schema:
- return ""
-
- parts = ["\n\n## Expected Output Schema\n"]
-
- if self._task_output_description:
- parts.append(f"**Description:** {self._task_output_description}\n\n")
-
- parts.append("**Schema:**\n```json\n")
- parts.append(json.dumps(self._task_output_schema, indent=2))
- parts.append("\n```\n")
-
- parts.append(
- "\nWhen ready, call `finalize_with_output(output={...})` with data matching this schema. "
- "Use `add_note()` before finalizing to record any notes, complaints, warnings, or errors."
- )
-
- return "".join(parts)
-
- def _get_urgency_notice(self) -> str:
- """
- Iteration-aware urgency notice for autonomous system prompts.
-
- Appended to autonomous prompts to nudge the LLM toward finalizing.
- Replaces the per-specialist urgency logic that was previously duplicated.
- """
- finalize_tool = "finalize_with_output" if self.has_output_schema else "finalize_result"
-
- if self.can_finalize:
- remaining = self._autonomous_config.max_iterations - self._autonomous_iteration
- if remaining <= 2:
- return f"\n\n## URGENT: Only {remaining} iteration(s) left — call `{finalize_tool}` NOW."
- if remaining <= 4:
- return f"\n\n## Finalize soon — {remaining} iterations remaining."
- return f"\n\n## `{finalize_tool}` is now available."
- return f"\n\n## Continue exploring (iteration {self._autonomous_iteration})."
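The gating behind `can_finalize` and the escalation in `_get_urgency_notice` can be sketched as a standalone function (the function name and defaults here are illustrative, not part of the codebase):

```python
def urgency_notice(iteration: int, min_iterations: int, max_iterations: int,
                   finalize_tool: str = "finalize_result") -> str:
    """Mirror the urgency escalation: gate on min_iterations, escalate near max."""
    if iteration < min_iterations:  # finalize tools not yet available
        return f"Continue exploring (iteration {iteration})."
    remaining = max_iterations - iteration
    if remaining <= 2:
        return f"URGENT: Only {remaining} iteration(s) left. Call `{finalize_tool}` NOW."
    if remaining <= 4:
        return f"Finalize soon. {remaining} iterations remaining."
    return f"`{finalize_tool}` is now available."
```

Because the notice is rebuilt into the system prompt every iteration, the LLM sees steadily increasing pressure to finalize as the budget runs out.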
-
- @agent_tool
- def add_note(self, note: str) -> dict[str, Any]:
- """
- Add a note to the result wrapper.
-
- Use this for notes, complaints, warnings, or errors encountered during execution.
- These are passed back to the orchestrator along with the result.
-
- Args:
- note: The note/complaint/warning/error message.
- """
- self._notes.append(note)
- return {"status": "ok", "total_notes": len(self._notes)}
-
- ## Generic Finalize Tool (for orchestrator-defined schemas)
-
- @agent_tool(availability=lambda self: self.can_finalize and self.has_output_schema)
- @token_optimized
- def _finalize_with_output(self, output: dict[str, Any]) -> dict[str, Any]:
- """
- Finalize with output matching the orchestrator's expected schema.
-
- This tool is available when the orchestrator has defined an output schema
- for the task. The output must match the schema or validation will fail.
-
- Args:
- output: Result data matching the expected output schema.
- """
- if not self._task_output_schema:
- return {"error": "No output schema defined for this task"}
-
- # Validate against schema
- try:
- jsonschema.validate(instance=output, schema=self._task_output_schema)
- except jsonschema.ValidationError as e:
- return {
- "error": "Output does not match expected schema",
- "validation_error": str(e.message),
- "schema_path": list(e.absolute_schema_path),
- "hint": "Fix the output structure and try again.",
- }
-
- # Store the wrapped result
- self._wrapped_result = SpecialistResultWrapper(
- output=output,
- success=True,
- notes=self._notes.copy(),
- failure_reason=None,
- )
-
- logger.info("Specialist finalized with output matching schema")
- return {
- "status": "success",
- "message": "Output validated and stored successfully",
- "notes_count": len(self._notes),
- }
-
- @agent_tool(availability=lambda self: self.can_finalize and self.has_output_schema)
- @token_optimized
- def _finalize_with_failure(self, reason: str) -> dict[str, Any]:
- """
- Finalize with failure when the task cannot be completed.
-
- Use this when you cannot produce the expected output after thorough analysis.
-
- Args:
- reason: Explanation of why the task could not be completed.
- """
- self._wrapped_result = SpecialistResultWrapper(
- output=None,
- success=False,
- notes=self._notes.copy(),
- failure_reason=reason,
- )
-
- logger.info("Specialist finalized with failure: %s", reason)
- return {
- "status": "failure",
- "message": "Task marked as failed",
- "reason": reason,
- }
-
- ## Generic Finalize Tools (for tasks without output schema)
-
- @agent_tool(availability=lambda self: self.can_finalize and not self.has_output_schema)
- @token_optimized
- def _finalize_result(self, output: dict[str, Any]) -> dict[str, Any]:
- """
- Finalize and return the result of your analysis.
-
- Use this to submit your findings when you have completed the task.
- The output should contain all relevant information discovered.
-
- Args:
- output: Dictionary containing your findings and analysis results.
- """
- self._wrapped_result = SpecialistResultWrapper(
- output=output,
- success=True,
- notes=self._notes.copy(),
- )
-
- logger.info("Specialist finalized with result (no schema)")
- return {
- "status": "success",
- "message": "Result submitted successfully",
- "notes_count": len(self._notes),
- }
-
- @agent_tool(availability=lambda self: self.can_finalize and not self.has_output_schema)
- @token_optimized
- def _finalize_failure(self, reason: str) -> dict[str, Any]:
- """
- Finalize with failure when the task cannot be completed.
-
- Use this when you cannot produce results after thorough analysis.
-
- Args:
- reason: Explanation of why the task could not be completed.
- """
- self._wrapped_result = SpecialistResultWrapper(
- output=None,
- success=False,
- notes=self._notes.copy(),
- failure_reason=reason,
- )
-
- logger.info("Specialist finalized with failure (no schema): %s", reason)
- return {
- "status": "failure",
- "message": "Task marked as failed",
- "reason": reason,
- }
-
- ## Public API
-
- def run_autonomous(
- self,
- task: str,
- config: AutonomousConfig | None = None,
- output_schema: dict[str, Any] | None = None,
- output_description: str | None = None,
- ) -> BaseModel | None:
- """
- Run the specialist autonomously to completion.
-
- The specialist will:
- 1. Use its tools to explore and analyze data
- 2. After min_iterations, finalize tools become available (via can_finalize)
- 3. Return a typed result when finalize is called, or None on timeout
-
- Args:
- task: User task description.
-             config: Autonomous run configuration (iteration limits). Uses defaults if None.
- output_schema: JSON Schema defining expected output structure.
- output_description: Human-readable description of expected output.
-
- Returns:
- Specialist-specific result model, or None if max iterations reached.
- """
- self.run_mode = RunMode.AUTONOMOUS
- self._autonomous_iteration = 0
- self._autonomous_config = config or AutonomousConfig()
-
- # Subclass should reset its own result fields in _reset_autonomous_state()
- self._reset_autonomous_state()
-
- # Set output schema AFTER reset (so it doesn't get cleared)
- if output_schema:
- self.set_output_schema(output_schema, output_description)
-
- # Seed the conversation
- initial_message = self._get_autonomous_initial_message(task)
- self._add_chat(ChatRole.USER, initial_message)
-
- logger.info(
- "Starting %s autonomous run for task: %s",
- self.__class__.__name__, task,
- )
-
- self._run_autonomous_loop()
-
- self.run_mode = RunMode.CONVERSATIONAL
-
- return self._get_autonomous_result()
-
- def _reset_autonomous_state(self) -> None:
- """
- Reset autonomous-mode state before a new run.
-
- Override in subclasses to clear specialist-specific result fields
- (e.g., self._discovery_result = None). Call super() first.
-
-         NOTE: Intentionally not abstract. The base implementation only clears the shared
-         schema/notes/result state; specialists without extra autonomous state simply inherit it.
- """
- # Clear orchestrator-defined output schema state
- self._task_output_schema = None
- self._task_output_description = None
- self._notes = []
- self._wrapped_result = None
-
- def reset(self) -> None:
- """Reset the conversation to a fresh state."""
- # Reset autonomous state
- self.run_mode = RunMode.CONVERSATIONAL
- self._autonomous_iteration = 0
- self._reset_autonomous_state()
-
- # Call parent reset
- super().reset()
-
- ## Agent loops
-
- def _run_autonomous_loop(self) -> None:
- """Run the autonomous agent loop with iteration tracking and finalize gating."""
- max_iterations = self._autonomous_config.max_iterations
- for iteration in range(max_iterations):
- self._autonomous_iteration = iteration + 1
- logger.debug("Autonomous loop iteration %d/%d", self._autonomous_iteration, max_iterations)
-
- messages = self._build_messages_for_llm()
- try:
- # Use tool_choice="required" to force the LLM to always call a tool
- # This prevents the loop from exiting due to text-only responses
- response = self._call_llm(
- messages,
- self._get_autonomous_system_prompt(),
- tool_choice="required",
- )
-
- if response.response_id:
- self._previous_response_id = response.response_id
-
- if response.content or response.tool_calls:
- chat = self._add_chat(
- role=ChatRole.ASSISTANT,
- content=response.content or "",
- tool_calls=response.tool_calls if response.tool_calls else None,
- llm_provider_response_id=response.response_id,
- )
- if response.content:
- self._emit_message(
- ChatResponseEmittedMessage(
- content=response.content,
- chat_id=chat.id,
- chat_thread_id=self._thread.id,
- )
- )
-
- if not response.tool_calls:
- # This shouldn't happen with tool_choice="required", but handle it just in case
- logger.warning("Autonomous loop: no tool calls in iteration %d (unexpected with tool_choice=required)", self._autonomous_iteration)
- return
-
- # Process tool calls and check for completion
- for tool_call in response.tool_calls:
- result_str = self._auto_execute_tool(tool_call.tool_name, tool_call.tool_arguments)
-
- self._add_chat(
- role=ChatRole.TOOL,
- content=f"Tool '{tool_call.tool_name}' result: {result_str}",
- tool_call_id=tool_call.call_id,
- )
-
- if self._check_autonomous_completion(tool_call.tool_name):
- logger.debug("Autonomous run completed at iteration %d", self._autonomous_iteration)
- return
-
- except Exception as e:
- logger.exception("Error in autonomous loop: %s", e)
- self._emit_message(ErrorEmittedMessage(error=str(e)))
- return
-
- logger.warning("Autonomous loop hit max iterations (%d) without finalization", max_iterations)
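Stripped of chat persistence and message emission, the control flow of the loop above reduces to roughly this shape (the callback signatures and names are illustrative):

```python
def run_loop(call_llm, execute_tool, max_iterations: int):
    """Call the LLM each iteration, execute its tool calls, and stop
    when a finalize tool sets a result or the iteration budget runs out."""
    result = {"final": None}
    for iteration in range(1, max_iterations + 1):
        tool_calls = call_llm(iteration)
        if not tool_calls:  # unexpected with tool_choice="required"
            return None
        for name, args in tool_calls:
            execute_tool(name, args, result)
            if result["final"] is not None:  # a finalize tool fired
                return result["final"]
    return None  # hit max iterations without finalization
```

The key property is that finalization is detected by inspecting state after each tool call, so the loop stays agnostic about which tool (schema-backed or not) actually produced the result.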
diff --git a/bluebox/agents/specialists/dom_specialist.py b/bluebox/agents/specialists/dom_specialist.py
new file mode 100644
index 00000000..7ab55868
--- /dev/null
+++ b/bluebox/agents/specialists/dom_specialist.py
@@ -0,0 +1,345 @@
+"""
+bluebox/agents/specialists/dom_specialist.py
+
+DOM specialist agent.
+
+Analyzes captured DOM snapshots to discover page structure, interactive elements,
+forms, tables, links, and navigation patterns. Used during the exploration phase
+to understand what the browser rendered and what UI elements are available.
+"""
+
+from __future__ import annotations
+
+from textwrap import dedent
+from typing import TYPE_CHECKING, Any, Callable
+
+from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool
+from bluebox.workspace import AgentWorkspace, LocalAgentWorkspace
+from bluebox.data_models.llms.interaction import (
+ Chat,
+ ChatThread,
+ EmittedMessage,
+)
+from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel
+from bluebox.llms.data_loaders.dom_data_loader import DOMDataLoader
+from bluebox.utils.logger import get_logger
+
+if TYPE_CHECKING:
+ from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader
+
+logger = get_logger(name=__name__)
+
+
+class DOMSpecialist(AbstractAgent):
+ """
+ DOM specialist agent.
+
+ Analyzes captured DOM snapshots to discover page structure,
+ interactive elements, forms, and navigation patterns.
+ """
+
+ AGENT_CARD = AgentCard(
+ description=(
+ "Analyzes captured DOM snapshots (page structure, forms, inputs, buttons, "
+ "links, tables, headings). Useful for understanding what the browser rendered "
+ "and what interactive elements exist on each page."
+ ),
+ )
+ SYSTEM_PROMPT: str = dedent("""\
+ You are a DOM structure analyst specializing in understanding web page layouts from captured browser snapshots.
+
+ ## What You Analyze
+
+ - **Forms**: Login forms, search forms, checkout forms — with their inputs, actions, and methods
+ - **Elements**: Inputs, buttons, links, headings, meta tags, hidden inputs, clickable elements
+ - **Tables**: Data tables with headers and row counts
+ - **Script tags**: Server-side data blobs (__NEXT_DATA__, __NUXT__), inline JSON config, structured data (ld+json)
+
+ ## What to Ignore
+
+ - Internal framework nodes, shadow DOM internals
+ - Style/layout-only elements with no semantic meaning
+
+ ## How to Work
+
+ 1. Start with `list_pages` to see all captured pages
+ 2. Use `get_elements(element_type=...)` to scan for inputs, buttons, links, headings, meta_tags, hidden_inputs, or clickable elements
+ 3. Use `get_forms` for forms with their child inputs
+ 4. Use `get_tables` for data tables
+ 5. Use `get_scripts` to find server-side data blobs and inline configuration
+ 6. Use `get_snapshot_diff` to understand what changed between pages
+ 7. Use `search_strings` to find specific content across snapshots
+
+ """)
+
+ AUTONOMOUS_SYSTEM_PROMPT: str = dedent("""\
+ You are a DOM structure analyst that autonomously maps out page structure from captured browser snapshots.
+
+ ## Your Mission
+
+ Analyze all captured DOM snapshots to produce a complete picture of:
+ - What pages were visited and in what order
+ - What forms exist and what they do (action URLs, input fields)
+ - What interactive elements are available (buttons, links, inputs)
+ - What data is displayed (tables, headings, text content)
+ - What tokens/keys are embedded in the page (CSRF, session IDs, API keys)
+ - What server-side data is rendered into the DOM (__NEXT_DATA__, inline JSON, ld+json)
+
+ ## Process
+
+ 1. **Survey**: Use `list_pages` to see all captured pages
+ 2. **Scan forms**: Use `get_forms` to find all forms with their inputs
+ 3. **Scan elements**: Use `get_elements(element_type=...)` for each type:
+ - `inputs` — text fields, dropdowns, checkboxes, date pickers
+ - `buttons` — submit buttons, action buttons
+ - `links` — anchor links with href values
+ - `headings` — H1-H6 page structure
+ - `meta_tags` — CSRF tokens, API configs, verification keys
+ - `hidden_inputs` — CSRF tokens, session IDs, form tokens
+ - `clickable` — anything the browser marked as interactive
+ 4. **Scan tables**: Use `get_tables` for data displays
+ 5. **Scan scripts**: Use `get_scripts` to find __NEXT_DATA__, inline JSON, framework state blobs
+ 6. **Check diffs**: Use `get_snapshot_diff` between consecutive pages to see what changed
+ 7. **Finalize**: Call the appropriate finalize tool with your findings
+
+ ## Output Focus
+
+ Prioritize: forms and their endpoints, parameterizable inputs, action buttons, data tables,
+ embedded tokens/keys, and server-side data blobs. These are what matter for routine construction.
+ """)
+
+ ## Magic methods
+
+ def __init__(
+ self,
+ emit_message_callable: Callable[[EmittedMessage], None],
+ dom_data_loader: DOMDataLoader,
+ documentation_data_loader: DocumentationDataLoader | None = None,
+ persist_chat_callable: Callable[[Chat], Chat] | None = None,
+ persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None,
+ stream_chunk_callable: Callable[[str], None] | None = None,
+ llm_model: LLMModel = OpenAIModel.GPT_5_1,
+ chat_thread: ChatThread | None = None,
+ existing_chats: list[Chat] | None = None,
+ workspace: AgentWorkspace | None = None,
+ ) -> None:
+ self._dom_data_loader = dom_data_loader
+
+ super().__init__(
+ emit_message_callable=emit_message_callable,
+ workspace=workspace or LocalAgentWorkspace.from_directory_path("./agent_workspace/specialist"),
+ persist_chat_callable=persist_chat_callable,
+ persist_chat_thread_callable=persist_chat_thread_callable,
+ stream_chunk_callable=stream_chunk_callable,
+ llm_model=llm_model,
+ chat_thread=chat_thread,
+ existing_chats=existing_chats,
+ documentation_data_loader=documentation_data_loader,
+ )
+ logger.debug(
+ "DOMSpecialist initialized with %d snapshots",
+ self._dom_data_loader.stats.total_snapshots,
+ )
+
+ ## Abstract method implementations
+
+ def _get_system_prompt(self) -> str:
+ stats = self._dom_data_loader.stats
+ context = (
+ f"\n\n## DOM Data Context\n"
+ f"- Total Snapshots: {stats.total_snapshots}\n"
+ f"- Unique URLs: {stats.unique_urls}\n"
+ f"- Unique Titles: {stats.unique_titles}\n"
+ f"- Hosts: {', '.join(stats.hosts.keys())}\n"
+ )
+ return self.SYSTEM_PROMPT + context
+
+ def _get_autonomous_system_prompt(self) -> str:
+ stats = self._dom_data_loader.stats
+ context = (
+ f"\n\n## DOM Data Context\n"
+ f"- Total Snapshots: {stats.total_snapshots}\n"
+ f"- Unique URLs: {stats.unique_urls}\n"
+ f"- Unique Titles: {stats.unique_titles}\n"
+ f"- Hosts: {', '.join(stats.hosts.keys())}\n"
+ )
+
+ return (
+ self.AUTONOMOUS_SYSTEM_PROMPT
+ + context
+ + self._get_output_schema_prompt_section()
+ + self._get_urgency_notice()
+ )
+
+ def _get_autonomous_initial_message(self, task: str) -> str:
+ finalize_success = "finalize_with_output" if self.has_output_schema else "finalize_result"
+
+ return (
+ f"TASK: {task}\n\n"
+ f"Analyze the captured DOM snapshots to map out page structure, forms, "
+ f"inputs, buttons, links, tables, and navigation patterns. "
+ f"When confident, use {finalize_success} to report your findings."
+ )
+
+ ## Tool handlers
+
+ @agent_tool(token_optimized=True)
+ def _list_pages(self) -> dict[str, Any]:
+ """List all captured pages with their URLs, titles, and snapshot metadata."""
+ pages = self._dom_data_loader.list_pages()
+ return {
+ "total_pages": len(pages),
+ "pages": pages,
+ }
+
+ @agent_tool(token_optimized=True)
+ def _get_elements(self, element_type: str, snapshot_index: int | None = None) -> dict[str, Any]:
+ """
+ Get elements of a specific type from DOM snapshots.
+
+ A single tool that replaces individual per-type tools. Supports:
+ - 'inputs' — INPUT, SELECT, TEXTAREA fields with their attributes and values
+ - 'buttons' — BUTTON elements and INPUT type=submit/button
+ - 'links' — anchor links () with href values
+    - 'links' — anchor links (<a> elements) with href values
+ - 'meta_tags' — META elements (CSRF tokens, API endpoints, OG tags, page config)
+ - 'hidden_inputs' — INPUT type=hidden (CSRF tokens, session IDs, form tokens)
+ - 'clickable' — all elements marked as clickable by the browser
+
+ Args:
+ element_type: One of 'inputs', 'buttons', 'links', 'headings', 'meta_tags', 'hidden_inputs', 'clickable'.
+ snapshot_index: If provided, only search this specific snapshot. Otherwise searches all.
+ """
+ try:
+ results = self._dom_data_loader.get_elements(element_type, snapshot_index)
+ except ValueError as e:
+ return {"error": str(e)}
+
+ total = sum(len(r["elements"]) for r in results)
+ return {
+ "element_type": element_type,
+ "total_elements": total,
+ "snapshots_with_elements": len(results),
+ "results": results,
+ }
+
+ @agent_tool(token_optimized=True)
+ def _get_forms(self, snapshot_index: int | None = None) -> dict[str, Any]:
+ """
+ Get all