diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml deleted file mode 100644 index 3b873bc1..00000000 --- a/.github/workflows/claude-code-review.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Claude Code Review -on: - pull_request: - types: [opened, synchronize, ready_for_review, reopened] - -jobs: - claude-review: - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: read - issues: read - id-token: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Run Claude Code Review - id: claude-review - uses: anthropics/claude-code-action@v1 - with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - plugin_marketplaces: 'https://github.com/anthropics/claude-code.git' - plugins: 'code-review@claude-code-plugins' - prompt: '/code-review:code-review ${{ github.repository }}/pull/${{ github.event.pull_request.number }}' diff --git a/.gitignore b/.gitignore index 7b5a45c4..9c24afdf 100644 --- a/.gitignore +++ b/.gitignore @@ -222,4 +222,7 @@ downloads/ benchmarks/ routine_output/ bluebox_workspace/ -api_indexing_output/ \ No newline at end of file +api_indexing_output/ +api_indexing_output*/ +agent_workspace/ +agent_workspace*/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 5417a4ad..36629d9e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -24,7 +24,8 @@ This file provides context and guidelines for working with the bluebox codebase. - `bluebox-monitor --host 127.0.0.1 --port 9222 --output-dir ./cdp_captures --url about:blank --incognito` - Start browser monitoring - `bluebox-discover --task "your task description" --cdp-captures-dir ./cdp_captures --output-dir ./routine_discovery_output --llm-model gpt-5.2` - Discover routines from captures - `bluebox-execute --routine-path example_data/example_routines/amtrak_one_way_train_search_routine.json --parameters-path example_data/example_routines/amtrak_one_way_train_search_input.json` - Execute a routine -- `bluebox-agent-adapter --agent RoutineDiscoveryAgentBeta --cdp-captures-dir ./cdp_captures` - Start HTTP adapter for programmatic agent interaction (see Agent HTTP Adapter section below) +- `bluebox-api-index --cdp-captures-dir ./cdp_captures --task "your task" --output-dir ./api_indexing_output --model gpt-5.2 --post-run-analysis` - Run the API indexing pipeline (exploration + routine construction) +- `bluebox-agent-adapter --agent NetworkSpecialist --cdp-captures-dir ./cdp_captures` - Start HTTP adapter for programmatic agent interaction (see Agent HTTP Adapter section below) - `bluebox-agent-adapter --list-agents` - List all available agents and their required data ### Chrome Debug Mode @@ -107,23 +108,35 @@ This file provides context and guidelines for working with the bluebox codebase. - `bluebox/utils/js_utils.py` - JavaScript code generation - `bluebox/utils/web_socket_utils.py` - WebSocket utilities for CDP - `bluebox/sdk/client.py` - Main SDK client +- `bluebox/workspace.py` - Agent workspace (artifact-oriented file I/O with provenance tracking) ### Agents -AI agents that power routine discovery and conversational interactions: +AI agents that power routine discovery, API indexing, and conversational interactions. All agents inherit from `AbstractAgent` (`bluebox/agents/abstract_agent.py`). 
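+
+A minimal sketch of a concrete agent (illustrative; the `AgentCard` constructor fields are elided, see `bluebox/agents/abstract_agent.py` for the real dataclass):
+
+```python
+from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool
+
+class EchoAgent(AbstractAgent):
+    AGENT_CARD = AgentCard(...)  # required on every concrete subclass
+
+    def _get_system_prompt(self) -> str:
+        return "You are an echo agent."
+
+    @agent_tool()
+    def _echo(self, text: str) -> dict:
+        """Echo text back to the caller (registered to the LLM as `echo`).
+
+        Args:
+            text: Text to echo.
+        """
+        return {"echo": text}
+```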
+**Core agents:** - `bluebox/agents/routine_discovery_agent.py` - Analyzes CDP captures to generate routines (identifies transactions, extracts/resolves variables, constructs operations) - `bluebox/agents/guide_agent.py` - Conversational agent for guiding users through routine creation/editing (maintains chat history, dynamic tool registration) +- `bluebox/agents/bluebox_agent.py` - General-purpose conversational agent + +**API Indexing Pipeline agents:** +- `bluebox/agents/principal_investigator.py` - Orchestrator: plans routine catalog, dispatches experiments to workers, reviews results, assembles and ships routines +- `bluebox/agents/workers/experiment_worker.py` - Browser-capable execution agent: live browser tools + recorded capture lookup tools, executes experiments +- `bluebox/agents/routine_inspector.py` - Independent quality gate: scores routines on 6 dimensions, hard-fails on 4xx/5xx or unresolved placeholders + +**Specialists** (domain-specific agents for exploration): +- `bluebox/agents/specialists/network_specialist.py` - Network traffic analysis +- `bluebox/agents/specialists/dom_specialist.py` - DOM structure analysis +- `bluebox/agents/specialists/interaction_specialist.py` - UI interaction analysis +- `bluebox/agents/specialists/js_specialist.py` - JavaScript file analysis +- `bluebox/agents/specialists/value_trace_resolver_specialist.py` - Storage & window property analysis **Agent HTTP Adapter** (`bluebox/scripts/agent_http_adapter.py`): -HTTP wrapper that exposes any `AbstractAgent` (or `AbstractSpecialist`) subclass as a JSON API, enabling programmatic interaction via curl. Agents are auto-discovered at runtime — adding a new `AbstractSpecialist` subclass makes it available with zero adapter changes. +HTTP wrapper that exposes any `AbstractAgent` subclass as a JSON API, enabling programmatic interaction via curl. Agents are auto-discovered at runtime — adding a new `AbstractAgent` subclass makes it available with zero adapter changes. ```bash -# Start adapter (default: RoutineDiscoveryAgentBeta) -bluebox-agent-adapter --cdp-captures-dir ./cdp_captures --port 8765 -q - -# Or pick a specific agent +# Start adapter with a specific agent bluebox-agent-adapter --agent NetworkSpecialist --cdp-captures-dir ./cdp_captures # Agents with no data requirements (e.g. 
BlueBoxAgent) don't need --cdp-captures-dir @@ -134,7 +147,7 @@ Endpoints: - `GET /health` — liveness check - `GET /status` — agent type, chat state, discovery support - `POST /chat {"message": "..."}` — send a chat message (all agents) -- `POST /discover {"task": "..."}` — run discovery/autonomous mode (specialists + RoutineDiscoveryAgentBeta) +- `POST /discover {"task": "..."}` — run discovery/autonomous mode - `GET /routine` — retrieve discovered routine JSON **Best practices when calling from Claude Code or scripts:** @@ -147,6 +160,7 @@ Endpoints: **LLM Infrastructure:** - `bluebox/llms/data_loaders/` - Specialized data loaders for CDP capture analysis: - `NetworkDataLoader` - HTTP request/response transactions + - `DOMDataLoader` - DOM snapshots (string-interning tables, element classification by tag family) - `JSDataLoader` - JavaScript files - `StorageDataLoader` - Cookies, localStorage, sessionStorage, IndexedDB - `WindowPropertyDataLoader` - Window property changes @@ -156,18 +170,50 @@ Endpoints: **Import patterns:** ```python +from bluebox.agents.abstract_agent import AbstractAgent, agent_tool, AgentCard from bluebox.agents.guide_agent import GuideAgent from bluebox.agents.routine_discovery_agent import RoutineDiscoveryAgent +from bluebox.agents.principal_investigator import PrincipalInvestigator +from bluebox.agents.workers.experiment_worker import ExperimentWorker +from bluebox.agents.routine_inspector import RoutineInspector +from bluebox.workspace import AgentWorkspace, LocalAgentWorkspace from bluebox.llms.data_loaders.network_data_loader import NetworkDataLoader +from bluebox.llms.data_loaders.dom_data_loader import DOMDataLoader from bluebox.llms.data_loaders.js_data_loader import JSDataLoader ``` +### Workspace + +The workspace (`bluebox/workspace.py`) is an artifact-oriented file I/O system attached to agents. Each workspace has a strict directory layout: + +- `raw/` (read-only): tool result artifacts and mounted external files +- `output/`: agent-generated deliverables +- `context/`: reusable notes/context saved for later use in the same run +- `meta/`: system-managed metadata (`manifest.jsonl`, `input_mounts.jsonl`) — not editable +- `scratch/`: ephemeral scratch space + +External files (e.g. CDP capture JSONL) can be mounted into `raw/` via hardlinks using `attach_input_file()`. The `save_artifact()` API records provenance in `meta/manifest.jsonl` (SHA-256, size, content type, timestamp). + +### API Indexing Pipeline + +End-to-end pipeline (`bluebox-api-index`) that turns raw CDP captures into a catalog of executable routines. + +**Phase 1 — Exploration** (4 specialists in parallel): Network, Storage, DOM, and UI specialists each produce a structured exploration summary. + +**Phase 2 — Routine Construction**: PrincipalInvestigator reads summaries, dispatches ExperimentWorker agents, reviews results, assembles routines, submits to RoutineInspector for quality gating. Incremental persistence to disk. PI crash recovery via DiscoveryLedger. 
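+
+The Agent HTTP Adapter endpoints above can also be driven from scripts; a minimal `requests` sketch (illustrative; assumes an adapter running locally on port 8765):
+
+```python
+import requests
+
+BASE = "http://127.0.0.1:8765"
+
+assert requests.get(f"{BASE}/health").ok  # liveness check
+requests.post(f"{BASE}/chat", json={"message": "Which transactions were captured?"})
+requests.post(f"{BASE}/discover", json={"task": "search one-way trains"})
+print(requests.get(f"{BASE}/routine").json())  # discovered routine JSON
+```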
+ +**Data models:** +- `bluebox/data_models/orchestration/` - `DiscoveryLedger`, `ExperimentEntry`, `RoutineSpec`, `RoutineAttempt`, `RoutineCatalog`, `RoutineInspectionResult` +- `bluebox/data_models/api_indexing/` - `NetworkExplorationSummary`, `StorageExplorationSummary`, `DOMExplorationSummary`, `UIExplorationSummary` + ### Important Patterns - **Routine Execution**: Operations execute sequentially, maintaining state via `RoutineExecutionContext` - **Placeholder Resolution**: All parameters use `{{paramName}}` format; `Parameter.type` drives coercion at runtime - **Session Storage**: Use `session_storage_key` to store and retrieve data between operations - **CDP Sessions**: Use flattened sessions for multiplexing via `session_id` +- **Agent Tools**: Decorate with `@agent_tool()`. Supports `persist` (`NEVER`/`ALWAYS`/`OVERFLOW`), `max_characters`, and `token_optimized` parameters +- **Agent Card**: Every concrete `AbstractAgent` subclass must declare an `AGENT_CARD` ### Common Gotchas diff --git a/README.md b/README.md index f8772c8e..08188f92 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ bluebox-agent --context-file path/to/agent_context.json ## Create your own routines -To learn about the core technology powering BlueBox, see [routine_discovery.md](routine_discovery.md). +To learn about the core technology powering BlueBox, see [routine_discovery.md](docs/routine_discovery.md). ## Contributing 🤝 diff --git a/bluebox/agent_docs/common-issues/cors-failed-to-fetch.md b/bluebox/agent_docs/common-issues/cors-failed-to-fetch.md new file mode 100644 index 00000000..644caeb8 --- /dev/null +++ b/bluebox/agent_docs/common-issues/cors-failed-to-fetch.md @@ -0,0 +1,101 @@ +# Fetch Fails with TypeError: Failed to fetch (CORS) + +> Fetch operations fail with "TypeError: Failed to fetch" when the browser's origin doesn't match the API server's CORS `Access-Control-Allow-Origin` header. Fix by adding a `navigate` operation to the allowed origin before any `fetch`. Related: [fetch.md](../operations/fetch.md), [navigation.md](../operations/navigation.md) + +**Symptom:** Fetch operation returns `TypeError: Failed to fetch` or the response data is `null`/empty despite the endpoint working in experiments. + +**Root Cause:** The routine executor starts from `about:blank` (origin = `null`). Many APIs restrict CORS to their own website origin. For example, `api.nasdaq.com` only allows requests from origin `https://www.nasdaq.com`. Without a `navigate` operation first, the browser's origin is `null` and every `fetch` is blocked by CORS. + +**How to detect:** If an experiment confirmed the API works from the site's origin (e.g. `browser_eval_js(fetch(...))` succeeded after navigating to `www.example.com`) but the routine's `fetch` operation fails with `TypeError: Failed to fetch`, the routine is missing a `navigate` step. + +**Solutions:** + +| Problem | Fix | +|---------|-----| +| API requires same-origin (e.g. `api.example.com` allows `www.example.com`) | Add `navigate` to the allowed origin before `fetch` | +| API requires `Origin`/`Referer` headers | Add `"Origin"` and `"Referer"` to fetch headers | +| API is on the same domain as the website | Add `navigate` to the website URL first | +| Cloudflare/WAF blocks CORS preflight (OPTIONS → 403) | Set `"credentials": "omit"` on the fetch endpoint — this avoids the preflight OPTIONS request entirely, bypassing the block. 
Works for public APIs that don't need cookies | +| All else fails | Use `js_evaluate` with `fetch()` instead of a `fetch` operation — JS fetch from the navigated page context has the correct origin | + +**RULE:** Every routine that calls an external API SHOULD start with a `navigate` operation to establish the correct browser origin. This is cheap (one page load) and prevents CORS issues. + +**Example: Navigate to allowed origin, then fetch from API subdomain** +```json +[ + {"type": "navigate", "url": "https://www.example.com"}, + { + "type": "fetch", + "endpoint": { + "url": "https://api.example.com/api/data?q={{query}}", + "method": "GET", + "headers": { + "Accept": "application/json, text/plain, */*" + } + }, + "session_storage_key": "result" + }, + {"type": "return", "session_storage_key": "result"} +] +``` + +**Example: Navigate + auth token + data fetch (common pattern)** +```json +[ + {"type": "navigate", "url": "https://www.example.com"}, + { + "type": "fetch", + "endpoint": { + "url": "https://api.example.com/api/token", + "method": "POST", + "headers": {"Content-Type": "application/json"}, + "body": {"applicationName": "web"} + }, + "session_storage_key": "auth_response" + }, + { + "type": "js_evaluate", + "expression": "(function(){ var r = JSON.parse(sessionStorage.getItem('auth_response')); return r.data.token; })()", + "session_storage_key": "bearer_token" + }, + { + "type": "fetch", + "endpoint": { + "url": "https://api.example.com/api/data", + "method": "GET", + "headers": { + "Authorization": "Bearer {{sessionStorage.bearer_token}}", + "Accept": "application/json" + } + }, + "session_storage_key": "data_result" + }, + {"type": "return", "session_storage_key": "data_result"} +] +``` + +**Cloudflare / WAF Blocking Preflight Requests** + +Some APIs behind Cloudflare or other WAFs block CORS preflight (OPTIONS) requests with 403. This happens when `credentials: "include"` triggers a preflight that Cloudflare rejects. The captured network data will show OPTIONS requests returning 403 with `server: cloudflare` and `content-type: text/html`. + +**Fix:** If the API does NOT require cookies or session auth, set `"credentials": "omit"` on the fetch endpoint. This tells the browser NOT to send cookies, which often eliminates the preflight OPTIONS request entirely, bypassing the Cloudflare block. + +**When to try this:** The experiment shows `TypeError: Failed to fetch` AND the captured network data shows OPTIONS preflight returning 403 from Cloudflare. Try `credentials: "omit"` first — many public search/listing APIs work without cookies. + +```json +[ + {"type": "navigate", "url": "https://www.example.com"}, + { + "type": "fetch", + "endpoint": { + "url": "https://api.example.com/search", + "method": "POST", + "headers": {"Content-Type": "application/json", "Accept": "application/json"}, + "body": {"query": "{{search_term}}", "page": "{{page}}"}, + "credentials": "omit" + }, + "session_storage_key": "search_result" + }, + {"type": "return", "session_storage_key": "search_result"} +] +``` diff --git a/bluebox/agent_docs/core/auth-token-resolution.md b/bluebox/agent_docs/core/auth-token-resolution.md new file mode 100644 index 00000000..ffe4532c --- /dev/null +++ b/bluebox/agent_docs/core/auth-token-resolution.md @@ -0,0 +1,302 @@ +# Auth & Token Resolution Strategies + +Tokens and API keys are the #1 reason routines fail with 401/403 errors. This guide covers every way to discover, extract, and resolve auth credentials at runtime. 
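Before categorizing a token, a quick check is to decode it: JWTs expose an `exp` claim that tells you whether the credential is short-lived. A stdlib-only sketch (illustrative; only JWT-style tokens decode this way):

```python
import base64, json, time

def jwt_expiry(token: str) -> float | None:
    """Return the JWT `exp` claim, or None if the token is not a JWT."""
    try:
        payload = token.split(".")[1]
        payload += "=" * (-len(payload) % 4)  # restore stripped base64 padding
        claims = json.loads(base64.urlsafe_b64decode(payload))
        return claims.get("exp") if isinstance(claims, dict) else None
    except (IndexError, ValueError):
        return None

token = "eyJhbGciOi..."  # candidate token found during discovery
exp = jwt_expiry(token)
if exp is None:
    print("Not a JWT: likely a static key or an opaque session token")
elif exp < time.time() + 300:
    print("Short-lived JWT: fetch it at runtime inside the routine")
```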
## The Two Categories

| Category | Lifespan | Strategy |
|----------|----------|----------|
| **Static credentials** (API keys, subscription keys, client IDs) | Long-lived or permanent | Hardcode in the routine |
| **Dynamic tokens** (JWT, Bearer, session tokens, CSRF) | Short-lived, expire | Fetch at runtime within the routine |

## Where Tokens Live — Discovery Checklist

When exploring a site's auth, check ALL of these sources. Tokens can come from anywhere.

### 1. Network Requests (Most Common)

The captured session shows exactly which headers and tokens were used.

**How to find them:**
- Use `capture_search_transactions` to search for keywords: "token", "auth", "key", "bearer", "jwt"
- Use `capture_get_transaction` to inspect specific request headers
- Look for `Authorization: Bearer ...` headers
- Look for custom headers: `Ocp-Apim-Subscription-Key`, `X-Api-Key`, `X-Auth-Token`
- Look for POST requests to `/token`, `/auth`, `/login`, `/oauth` endpoints

**What you'll find:**
- The token endpoint URL
- The exact headers and body needed to get a token
- The response shape (where the token lives in the JSON response)
- Any static API keys used alongside the token

### 2. DOM — Inline Scripts and Meta Tags

Sites often embed tokens or config objects directly in the HTML.

**Common patterns (values illustrative):**
```html
<!-- CSRF token in a meta tag -->
<meta name="csrf-token" content="AbCdEf123...">

<!-- Config object with an API key in an inline script -->
<script>
  window.__CONFIG__ = { apiKey: "a1b2c3d4", env: "production" };
</script>

<!-- Server-rendered state with a session token -->
<script>
  window.__INITIAL_STATE__ = { auth: { accessToken: "eyJhbGciOi..." } };
</script>
+``` + +**Routine resolution:** +```json +{"type": "navigate", "url": "https://example.com"}, +{"type": "js_evaluate", "js": "(function() { return { token: document.querySelector('meta[name=\"csrf-token\"]').content }; })();", "session_storage_key": "csrf_data"} +``` + +Or use placeholders: +```json +"headers": { + "X-CSRF-Token": "{{meta:csrf-token}}", + "X-Api-Key": "{{windowProperty:__CONFIG__.apiKey}}" +} +``` + +### 3. Browser Storage (localStorage / sessionStorage) + +Sites store tokens in browser storage after the user (or the site's JS) authenticates. + +**How to discover:** +- Navigate to the site, then use `js_evaluate` to dump storage: +```javascript +(function() { + var ss = {}; + for (var i = 0; i < sessionStorage.length; i++) { + var k = sessionStorage.key(i); + ss[k] = sessionStorage.getItem(k); + } + var ls = {}; + for (var i = 0; i < localStorage.length; i++) { + var k = localStorage.key(i); + ls[k] = localStorage.getItem(k); + } + return { sessionStorage: ss, localStorage: ls }; +})() +``` + +**Common keys to look for:** `token`, `access_token`, `auth`, `jwt`, `session`, `user` + +**Routine resolution:** +```json +"headers": { + "Authorization": "Bearer {{localStorage:auth.access_token}}", + "X-Session": "{{sessionStorage:session.token}}" +} +``` + +### 4. Cookies + +Some sites use cookie-based auth — the token IS the cookie. + +**How to discover:** +- Use `get_cookies` operation to see all cookies including HttpOnly ones +- Look for cookies named: `session`, `token`, `auth`, `sid`, `csrf`, `XSRF-TOKEN` + +**Routine resolution — two approaches:** + +**a) Let the browser send cookies automatically:** +```json +[ + {"type": "navigate", "url": "https://example.com"}, + {"type": "sleep", "timeout_seconds": 2.0}, + { + "type": "fetch", + "endpoint": { + "url": "https://example.com/api/data", + "method": "GET", + "credentials": "include" + } + } +] +``` + +**b) Extract cookie value explicitly:** +```json +"headers": { + "X-XSRF-TOKEN": "{{cookie:XSRF-TOKEN}}" +} +``` + +### 5. Window Properties (JavaScript Globals) + +Sites set global JS variables with config/auth info. + +**How to discover:** +```javascript +(function() { + var keys = ['__CONFIG__', '__INITIAL_STATE__', 'ENV', '__NEXT_DATA__', + 'config', 'appConfig', '__APP_DATA__', '_env']; + var found = {}; + keys.forEach(function(k) { + if (window[k]) found[k] = window[k]; + }); + return found; +})() +``` + +**Routine resolution:** +```json +"headers": { + "X-Api-Key": "{{windowProperty:__CONFIG__.apiKey}}" +} +``` + +### 6. API Token Endpoints (Runtime Fetch) + +The most robust approach for dynamic tokens — fetch the token at runtime. + +**Pattern: fetch token → extract → use in subsequent requests** +```json +[ + {"type": "navigate", "url": "https://example.com", "sleep_after_navigation_seconds": 2.0}, + { + "type": "fetch", + "endpoint": { + "url": "https://example.com/api/auth/token", + "method": "POST", + "headers": { + "Content-Type": "application/json", + "X-Api-Key": "HARDCODED_SITE_KEY_FROM_CAPTURES" + }, + "body": { + "applicationName": "website", + "channel": "Web" + }, + "credentials": "same-origin" + }, + "session_storage_key": "token_response" + }, + { + "type": "fetch", + "endpoint": { + "url": "https://example.com/api/data", + "method": "GET", + "headers": { + "Authorization": "Bearer {{sessionStorage:token_response.token}}", + "X-Api-Key": "HARDCODED_SITE_KEY_FROM_CAPTURES" + } + }, + "session_storage_key": "data_result" + }, + {"type": "return", "session_storage_key": "data_result"} +] +``` + +### 7. 
JS Evaluation (Extract from Running Page) + +When tokens are generated by the site's JavaScript and aren't in storage or DOM. + +**Pattern: navigate → let site JS run → extract token via JS eval** +```json +[ + {"type": "navigate", "url": "https://example.com", "sleep_after_navigation_seconds": 3.0}, + { + "type": "js_evaluate", + "js": "(function() { try { var state = JSON.parse(sessionStorage.getItem('persist:root')); var auth = JSON.parse(state.auth); return { token: auth.accessToken }; } catch(e) { return { error: String(e) }; } })();", + "session_storage_key": "extracted_token" + }, + { + "type": "fetch", + "endpoint": { + "url": "https://example.com/api/data", + "headers": { + "Authorization": "Bearer {{sessionStorage:extracted_token.token}}" + } + }, + "session_storage_key": "result" + }, + {"type": "return", "session_storage_key": "result"} +] +``` + +## Experiment Strategies for the PI + +When a site requires auth, the PI should dispatch experiments that explore MULTIPLE resolution strategies. Don't just try one approach and give up. + +### Experiment 1: Discover What Auth Exists + +``` +"Navigate to {site_url}, wait for page load, then inspect ALL available auth sources: +1. Run JS to dump sessionStorage, localStorage, and window config objects +2. Use get_cookies to list all cookies +3. Check DOM for meta tags with csrf/token/key attributes +4. Use capture_search_transactions to find requests with 'token' or 'auth' in the URL + +Report back: what tokens/keys did you find, where did they come from, and what +do they look like (first 20 chars)? We saw '{observed_token_prefix}...' in the +captured session — is it still the same or has it changed?" +``` + +### Experiment 2: Try Token Endpoint + +``` +"The captured session shows a token endpoint at {token_url}. +1. Use capture_get_transaction to get the EXACT headers and body from the capture +2. Navigate to {site_url} first to establish cookies +3. Call the token endpoint with the same headers/body +4. If it returns a token, store it and try calling {data_endpoint} with + Authorization: Bearer {token} +5. If it fails, try variations: different Content-Type, with/without credentials, + with cookies via credentials:'include' + +The captured request had these headers: {captured_headers} +The captured body was: {captured_body}" +``` + +### Experiment 3: Try Page-Embedded Token + +``` +"Navigate to {site_url} and wait 3 seconds for JS to execute. +Then try to find auth tokens in the page: +1. Check window.__CONFIG__, window.__INITIAL_STATE__, window.ENV +2. Check sessionStorage and localStorage for 'token', 'auth', 'jwt' keys +3. Check meta tags for csrf-token, api-key +4. If you find a token, try using it to call {data_endpoint} + +In the captured session, we saw a token that looked like: '{token_sample}' +It may be the same static value or may have changed." +``` + +### Experiment 4: Try Cookie-Based Auth + +``` +"Navigate to {site_url} and wait for page load. +The site may use cookie-based auth (the navigation itself establishes the session). +1. After navigation, call {data_endpoint} with credentials:'include' to send cookies +2. If that works, the routine just needs navigate + fetch with credentials:'include' +3. If it fails, dump cookies with get_cookies to see what cookies exist +4. 
Try different credential modes: 'same-origin' vs 'include'" +``` + +## Common Auth Patterns by Site Type + +| Site Type | Typical Auth | Strategy | +|-----------|-------------|----------| +| Modern SPA (React/Angular) | JWT via token endpoint | Fetch token → use Bearer header | +| Traditional server-rendered | Session cookie | Navigate → fetch with `credentials: "include"` | +| Public API with key | Static API key in header | Hardcode from captures | +| CSRF-protected forms | CSRF token in meta/cookie | Extract via `{{meta:csrf-token}}` or `{{cookie:XSRF-TOKEN}}` | +| OAuth-protected | Access token via OAuth flow | Fetch token endpoint with client credentials | +| Azure API Management | Subscription key + JWT | Hardcode sub key, fetch JWT at runtime | + +## Key Rules + +1. **Static keys are HARDCODED** — API keys, subscription keys, client IDs from captures go directly into the routine. Never expose them as user parameters. +2. **Dynamic tokens are FETCHED** — JWT, Bearer, session tokens must be obtained at runtime via a fetch or js_evaluate operation within the routine. +3. **Always navigate first** — Most auth requires being on the site's origin for cookies and CORS to work. +4. **Check multiple sources** — A token might be in storage, DOM, cookies, AND network requests. Find the most reliable source. +5. **Include observed values in experiments** — Tell the worker what the token looked like in the captured session so they know what to look for and can verify if it's static or dynamic. +6. **The PI must try multiple strategies** — If the token endpoint fails, try page-embedded tokens. If those fail, try cookie-based auth. If that fails, try JS evaluation. Don't give up after one approach. diff --git a/bluebox/agent_docs/core/naming-conventions.md b/bluebox/agent_docs/core/naming-conventions.md new file mode 100644 index 00000000..7da58896 --- /dev/null +++ b/bluebox/agent_docs/core/naming-conventions.md @@ -0,0 +1,111 @@ +# Routine Naming & Documentation Conventions + +Routines are vectorized and stored in databases for other agents to discover via semantic search. Clear, precise metadata is **essential** — a routine with a vague name or missing description is invisible and unusable. + +## Routine Name + +**Format:** `snake_case` with a `verb_site_noun` pattern and **3+ segments**. + +The name MUST include the site or service name so it makes sense in isolation. Another agent reading ONLY the name — with no other context — should know what site this targets and what it does. + +| Good | Bad | Why | +|------|-----|-----| +| `get_premierleague_standings` | `get_standings` | Standings from where? | +| `search_premierleague_matches_by_season` | `search_matches` | Which sport? Which site? | +| `fetch_amtrak_train_schedules` | `get_data` | Completely generic | +| `download_arxiv_paper_pdf` | `download_paper` | Which paper repository? | +| `list_espn_upcoming_fixtures` | `list_fixtures` | Which sports platform? | +| `get_github_repo_stars` | `get_content_item` | Content from where? What item? 
| + +**Rules:** +- Always start with a verb: `get_`, `search_`, `fetch_`, `list_`, `download_`, `create_`, `submit_` +- Always include the **site/service name**: `premierleague`, `amtrak`, `arxiv`, `espn`, `github` +- Include the domain noun: `standings`, `matches`, `flights`, `players`, `teams` +- Add qualifiers when needed: `_by_season`, `_one_way`, `_with_details` +- Use `snake_case` only — no camelCase, no spaces, no hyphens +- Minimum 3 underscore-separated segments: `verb_site_noun` + +## Routine Description + +**Minimum 8 words.** Must answer three questions: + +1. **What does it do?** (the action) +2. **What inputs does it take?** (the parameters) +3. **What data does it return?** (the output structure) + +### Examples + +**Good (all three questions answered):** +> Fetches Premier League standings for a given competition ID and season ID, returning team names, positions, wins, draws, losses, goals scored, goals conceded, and total points. + +> Searches for one-way flights from an origin airport to a destination on a specific date, returning a list of flights with airline, departure time, arrival time, duration, stops, and price. + +**Bad (missing information):** +> "Get standings" — too short, missing input/output info +> "A routine for the Premier League" — doesn't say what it does or returns +> "Fetches data from the API" — which API? what data? what format? + +### Template + +> `{Verb}s {what} for a given {param1} and {param2}, returning {field1}, {field2}, {field3}, and {field4}.` + +## Parameter Names + +**Format:** `snake_case`, descriptive, never ambiguous. + +| Good | Bad | Why | +|------|-----|-----| +| `competition_id` | `id` | Ambiguous — id of what? | +| `season_year` | `year` | Could mean any year | +| `departure_date` | `date` | Which date? | +| `team_name` | `name` | Name of what? | +| `search_query` | `q` | Cryptic | +| `page_number` | `page` | Acceptable but `page_number` is clearer | + +## Parameter Descriptions + +**Minimum 3 words.** Must explain: +1. What the value represents +2. Expected format or range (when applicable) + +### Examples + +**Good:** +> "The unique competition identifier, typically a numeric ID (e.g. 1 for Premier League)" +> "Departure date in YYYY-MM-DD format (e.g. 2024-12-25)" +> "Season year as a 4-digit number (e.g. 2024 for the 2024-25 season)" + +**Bad:** +> "ID" — too terse, ambiguous +> "The season" — doesn't explain format +> "query" — just restates the parameter name + +## Non-Obvious Parameters: Sourcing is MANDATORY + +If a parameter value is NOT something a human would naturally know — opaque numeric IDs, internal slugs, encoded tokens, UUIDs — the description **MUST** explain where to get valid values. Without sourcing, the routine is unusable. + +**How to identify non-obvious parameters:** names ending in `_id`, `_slug`, `_code`, `_token`, `_key`, `_hash`, or any numeric/integer parameter that represents an internal identifier. + +### Examples + +**Good (includes sourcing):** +> "Internal competition ID. Obtain from the get_competitions routine or the /competitions API endpoint. Example: 1 = Premier League, 2 = Championship." + +> "Season ID as used by the Premier League API. Use the get_seasons routine to list valid season IDs for a competition. Example: 418 = 2023-24 season." + +> "Team slug as it appears in the site URL path (e.g. 'arsenal', 'manchester-united'). Find by calling get_teams or navigating to the team page." + +**Bad (no sourcing — where do I get these?):** +> "The competition ID" — which competition? 
where do I look it up? +> "Season identifier" — what values are valid? how do I find them? +> "Internal team code" — completely opaque, no way to discover valid values + +**Rule of thumb:** if you can't google the value, the description must say how to get it. + +## Why This Matters + +Other agents search the routine database with natural language queries like: +- "Find me Premier League standings" +- "Search for flights from LAX to JFK" + +If your routine is named `get_data` with description "fetches data", it will never match these queries. But `get_league_standings` with a rich description will rank highly and be selected for execution. diff --git a/bluebox/agents/abstract_agent.py b/bluebox/agents/abstract_agent.py index 02e4263a..8fc0d731 100644 --- a/bluebox/agents/abstract_agent.py +++ b/bluebox/agents/abstract_agent.py @@ -19,14 +19,20 @@ import json import functools +import re from abc import ABC, abstractmethod from concurrent.futures import Future, ThreadPoolExecutor, as_completed from dataclasses import dataclass from datetime import datetime -from typing import Any, Callable, ClassVar, get_type_hints +from enum import StrEnum +from pathlib import Path +from textwrap import dedent +from typing import Any, Callable, ClassVar, NamedTuple, get_type_hints -from pydantic import TypeAdapter, ValidationError +import jsonschema +from pydantic import BaseModel, TypeAdapter, ValidationError +from bluebox.workspace import AgentWorkspace from bluebox.data_models.llms.interaction import ( BrowserAgentStepEmittedMessage, Chat, @@ -43,16 +49,51 @@ ToolInvocationStatus, ) from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel +from bluebox.data_models.orchestration.result import SpecialistResultWrapper from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader, FileType from bluebox.llms.llm_client import LLMClient from bluebox.llms.tools.tool_utils import extract_description_from_docstring, generate_parameters_schema +from bluebox.utils.code_execution_sandbox import ( + BLOCKED_MODULES, + BLOCKED_PATTERNS, + execute_python_sandboxed, + get_active_sandbox_mode, + get_workaround_for_error, +) from bluebox.utils.data_utils import format_bytes -from bluebox.utils.llm_utils import token_optimized +from bluebox.utils.llm_utils import token_optimized as token_optimized_decorator from bluebox.utils.logger import get_logger logger = get_logger(name=__name__) +class ToolResultPersistMode(StrEnum): + NEVER = "never" + ALWAYS = "always" + OVERFLOW = "overflow" + + +# Keep persisted tool previews small so iterative runs don't blow context. +PERSISTED_TOOL_PREVIEW_MAX_CHARS = 800 + + +class AgentExecutionMode(StrEnum): + """Execution mode for agent loops.""" + CONVERSATIONAL = "conversational" + AUTONOMOUS = "autonomous" + + +class AutonomousRunConfig(NamedTuple): + """ + Configuration for autonomous agent runs. + + min_iterations controls when finalize tools become available. + max_iterations controls when the autonomous loop gives up. 
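+
+    Example (illustrative):
+        AutonomousRunConfig(min_iterations=2, max_iterations=8)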
+ """ + min_iterations: int = 3 + max_iterations: int = 10 + + @dataclass(frozen=True) class AgentCard: """ @@ -72,6 +113,36 @@ class _ToolMeta: description: str # tool description shown to the LLM parameters: dict[str, Any] # JSON Schema for tool parameters availability: bool | Callable[..., bool] # whether the tool should be registered right now + persist: ToolResultPersistMode = ToolResultPersistMode.NEVER + max_characters: int = 10_000 + token_optimized: bool = False + + +def _serialize_tool_result(tool_result: Any) -> tuple[str, str]: + try: + return json.dumps(tool_result, ensure_ascii=False, default=str, indent=2), "json" + except (TypeError, ValueError): + return str(tool_result), "text" + + +def _normalize_file_scope(scope: str) -> str: + """Normalize and validate file tool scope.""" + normalized_scope = scope.strip().lower() + if normalized_scope not in {"workspace", "docs"}: + raise ValueError("scope must be 'workspace' or 'docs'") + return normalized_scope + + +def _parse_search_terms(query: str) -> list[str]: + """Split query text into distinct terms for terms-mode search.""" + seen: set[str] = set() + terms: list[str] = [] + for token in re.split(r"[,\s]+", query): + term = token.strip() + if term and term not in seen: + seen.add(term) + terms.append(term) + return terms def agent_tool( @@ -79,6 +150,9 @@ def agent_tool( parameters: dict[str, Any] | None = None, *, availability: bool | Callable[..., bool] = True, + persist: ToolResultPersistMode = ToolResultPersistMode.NEVER, + max_characters: int = 10_000, + token_optimized: bool = False, ) -> Callable: """ Decorator that marks a method as an agent tool handler. @@ -107,6 +181,12 @@ def _my_tool(self, x: str) -> dict: ... tool is available only when it returns True. Use this for tools gated behind lifecycle state or dynamic conditions (e.g. ``availability=lambda self: self.can_finalize``). + persist: Tool-result persistence policy. + - ToolResultPersistMode.NEVER (default): never persist. + - ToolResultPersistMode.ALWAYS: always persist. + - ToolResultPersistMode.OVERFLOW: persist only if result exceeds max_characters. + max_characters: Character threshold for OVERFLOW mode. + token_optimized: If True, encode tool output with toon for token efficiency. """ def decorator(method: Callable, desc: str | None = None) -> Callable: tool_name = method.__name__.lstrip("_") @@ -125,11 +205,22 @@ def decorator(method: Callable, desc: str | None = None) -> Callable: else: final_parameters = parameters + if not isinstance(persist, ToolResultPersistMode): + raise ValueError( + f"Tool {tool_name} has invalid persist value: {persist!r}. " + "Use ToolResultPersistMode values.", + ) + if max_characters <= 0: + raise ValueError(f"Tool {tool_name} must have max_characters > 0") + method._tool_meta = _ToolMeta( name=tool_name, description=final_description, parameters=final_parameters, availability=availability, + persist=persist, + max_characters=max_characters, + token_optimized=token_optimized, ) return method @@ -162,11 +253,19 @@ class AbstractAgent(ABC): # Class-level configuration (can be overridden by subclasses) AGENT_LOOP_MAX_ITERATIONS: int = 10 AGENT_CARD: ClassVar[AgentCard] # must be defined by every concrete subclass + _subclasses: ClassVar[list[type[AbstractAgent]]] = [] + WORKSPACE_USAGE_SECTION: ClassVar[str] = dedent("""\ + ## Workspace + - Use `raw/` (read-only) for tool-call artifacts (inputs/results), not deliverables. + - Write generated deliverables to `output/`. + - Store reusable notes/context in `context/`. 
+ - `meta/` (read-only) is system-managed and not editable. + """) def __init_subclass__(cls, **kwargs: Any) -> None: """Validate that concrete subclasses define AGENT_CARD.""" super().__init_subclass__(**kwargs) - # skip abstract classes (matches existing naming convention in AbstractSpecialist) + # skip abstract classes if cls.__name__.startswith("Abstract"): return # validate that the subclass defines an AGENT_CARD class variable of type AgentCard @@ -174,6 +273,13 @@ def __init_subclass__(cls, **kwargs: Any) -> None: raise TypeError( f"{cls.__name__} must define an AGENT_CARD class variable of type AgentCard" ) + if cls not in cls._subclasses: + cls._subclasses.append(cls) + + @classmethod + def get_all_subclasses(cls) -> list[type[AbstractAgent]]: + """Return a copy of all registered concrete AbstractAgent subclasses.""" + return cls._subclasses.copy() ## Abstract methods @@ -186,6 +292,7 @@ def _get_system_prompt(self) -> str: def __init__( self, emit_message_callable: Callable[[EmittedMessage], None], + workspace: AgentWorkspace | None = None, persist_chat_callable: Callable[[Chat], Chat] | None = None, persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, stream_chunk_callable: Callable[[str], None] | None = None, @@ -194,12 +301,17 @@ def __init__( existing_chats: list[Chat] | None = None, documentation_data_loader: DocumentationDataLoader | None = None, on_llm_response: Callable[[LLMChatResponse], None] | None = None, + execution_mode: AgentExecutionMode = AgentExecutionMode.CONVERSATIONAL, + allow_code_execution: bool = False, + code_execution_globals: dict[str, Any] | None = None, ) -> None: """ Initialize the agent. Args: emit_message_callable: Callback to emit messages to the host. + workspace: Optional workspace used for agent file operations. + When omitted, workspace-scoped tools/features are unavailable. persist_chat_callable: Optional callback to persist Chat objects. persist_chat_thread_callable: Optional callback to persist ChatThread. stream_chunk_callable: Optional callback for streaming text chunks. @@ -208,16 +320,46 @@ def __init__( existing_chats: Existing Chat messages if loading from persistence. documentation_data_loader: Optional DocumentationDataLoader for docs/code search tools. on_llm_response: Optional callback invoked after each LLM call with the response (for token tracking). + execution_mode: Agent execution mode (conversational or autonomous). + allow_code_execution: Whether to expose the generic execute_python tool. + code_execution_globals: Globals injected into execute_python sandbox runs. + Must be empty when allow_code_execution is False. 
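+
+        Raises:
+            ValueError: If code_execution_globals is non-empty while
+                allow_code_execution is False.
+            TypeError: If workspace is provided but does not implement
+                AgentWorkspace.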
""" + normalized_globals = dict(code_execution_globals or {}) + if not allow_code_execution and normalized_globals: + raise ValueError( + "code_execution_globals must be empty when allow_code_execution is False", + ) + self._emit_message_callable = emit_message_callable self._persist_chat_callable = persist_chat_callable self._persist_chat_thread_callable = persist_chat_thread_callable self._stream_chunk_callable = stream_chunk_callable self._documentation_data_loader = documentation_data_loader self._on_llm_response = on_llm_response + self._on_chat_added: Callable[[Chat], None] | None = None + self.execution_mode = execution_mode + self._autonomous_iteration: int = 0 + self._autonomous_config: AutonomousRunConfig = AutonomousRunConfig() + self._task_output_schema: dict[str, Any] | None = None + self._task_output_description: str | None = None + self._notes: list[str] = [] + self._wrapped_result: BaseModel | None = None + self._finalize_with_output_failed = False + self._allow_code_execution = allow_code_execution + self._code_execution_globals = normalized_globals if allow_code_execution else {} + self._sandbox_mode = ( + get_active_sandbox_mode(work_dir_set=workspace is not None) + if allow_code_execution + else "blocklist" + ) self._previous_response_id: str | None = None self._response_id_to_chat_index: dict[str, int] = {} + if workspace is not None and not isinstance(workspace, AgentWorkspace): + raise TypeError("workspace must implement AgentWorkspace when provided") + self._workspace = workspace + self.llm_model = llm_model self.llm_client = LLMClient(llm_model) @@ -236,6 +378,239 @@ def __init__( for chat in existing_chats: self._chats[chat.id] = chat + @agent_tool( + availability=lambda self: self._allow_code_execution, + persist=ToolResultPersistMode.OVERFLOW, + ) + def _execute_python(self, code: str) -> dict[str, Any]: + """ + Execute Python code in a sandbox. + + When a workspace is configured, code runs with `work_dir` set to the + workspace root so file I/O is scoped to that directory. All files + created or modified anywhere in the workspace are tracked and + reported in the response as ``files_created``. + Without a workspace, execution is compute-only and file I/O (open/Path) + remains blocked by sandbox policy. + + Globals passed via `code_execution_globals` are always available. + + Args: + code: Python code to execute. 
+ """ + files_created: list[str] = [] + + if self.has_workspace: + workspace = self._require_workspace() + workspace.ensure_dirs() + + # Snapshot entire workspace before execution for file-tracking + files_before = workspace.snapshot_paths(["."]) + + sandbox_result = execute_python_sandboxed( + code=code, + extra_globals=self._code_execution_globals, + work_dir=str(workspace.root_path.resolve()), + read_only_paths=[ + str((workspace.root_path / "raw").resolve()), + str((workspace.root_path / "meta").resolve()), + ], + ) + + # Diff entire workspace to detect created/modified files + files_after = workspace.snapshot_paths(["."]) + delta = workspace.diff_snapshot(files_before, files_after) + changed_states = delta.created + delta.modified + files_created = [ + str(workspace.root_path / state.relative_path) + for state in changed_states + ] + else: + sandbox_result = execute_python_sandboxed( + code=code, + extra_globals=self._code_execution_globals, + ) + + # Build response + result: dict[str, Any] = {} + + if "error" in sandbox_result: + result["error"] = sandbox_result["error"] + workaround = get_workaround_for_error(sandbox_result["error"]) + if workaround: + result["_hint"] = ( + f"Sandbox restriction: {workaround} " + "Fix the code and call execute_python again." + ) + elif self.has_workspace: + result["_hint"] = ( + "Code failed. Read the error and stdout carefully, then fix and retry." + ) + + output = sandbox_result.get("output", "") + if output and output != "(no output)": + result["output"] = output + + if files_created: + result["files_created"] = files_created + result["output_file"] = files_created[0] + + if not result: + result["output"] = "(no output)" + + return result + + def _generate_code_execution_prompt(self) -> str: + """Generate a prompt section describing the code execution environment. + + Covers sandbox mode (blocklist restrictions vs docker/lambda permissiveness) + and any pre-loaded globals available in the sandbox. + + Subclasses can override to add domain-specific guidance, but should call + super() to include the base sandbox information. + """ + if not self._allow_code_execution: + return "" + + lines: list[str] = ["\n## Code Execution Environment"] + + if not self.has_workspace: + lines.append( + "\nNo workspace is configured for this agent." + "\nExecution is compute-only: filesystem access is disabled." + "\n`open()` and `Path` file operations are unavailable." + ) + + # Globals section + if self._code_execution_globals: + global_names = ", ".join(f"`{k}`" for k in sorted(self._code_execution_globals)) + lines.append( + f"\nPre-loaded globals available in `execute_python`: {global_names}." + "\nUse these directly — do NOT re-import or re-define them." + ) + + # Sandbox-specific guidance + if self._sandbox_mode == "blocklist": + blocked_modules_str = ", ".join(sorted(BLOCKED_MODULES)) + blocked_patterns_str = ", ".join( + f"`{p}`" for p, _ in BLOCKED_PATTERNS if p != "open(" + ) + path_rule = ( + "\n- `Path` is already pre-loaded — use it directly, do NOT `import pathlib`" + if self.has_workspace + else "\n- `Path` is not available without workspace-backed file scope" + ) + open_rule = ( + "\n- `open()` is already pre-loaded — use it directly for all file I/O" + if self.has_workspace + else "\n- `open()` is blocked when no workspace is configured" + ) + lines.append( + "\n### Sandbox Restrictions (IMPORTANT — read before writing any Python code)" + "\nYou are running in restricted sandbox mode." 
+ "\n" + "\n**Blocked imports** — do NOT import any of these modules:" + f"\n{blocked_modules_str}" + "\n" + "\n**Blocked code patterns** — do NOT use any of these in your code:" + f"\n{blocked_patterns_str}" + "\n" + "\n**Safe imports you CAN use:**" + "\n`collections`, `re`, `datetime`, `math`, `itertools`, `functools`," + " `operator`, `string`, `textwrap`, `decimal`, `fractions`," + " `statistics`, `urllib.parse`, `hashlib`, `hmac`, `base64`," + " `copy`, `pprint`, `dataclasses`, `enum`, `typing`" + "\n" + "\n**Key rules to avoid errors:**" + "\n- Do NOT `import os`, `import pathlib`, `import sys`, or any blocked module" + f"{path_rule}" + f"{open_rule}" + "\n- Do NOT use `getattr()` — use dict access: `obj[\"key\"]` or `obj.get(\"key\")`" + ) + else: + # Docker or Lambda — more permissive + lines.append( + f"\nSandbox mode: **{self._sandbox_mode}** (full Python environment available)." + "\nYou have access to the standard library and common packages." + + ( + "\nFile I/O is scoped to the workspace directory." + if self.has_workspace + else "\nNo workspace attached: file I/O is disabled for this run." + ) + ) + + return "\n".join(lines) + + def _maybe_persist_tool_result( + self, + tool_name: str, + tool_meta: _ToolMeta, + tool_result: Any, + ) -> Any: + persist_mode = tool_meta.persist + if persist_mode == ToolResultPersistMode.NEVER: + return tool_result + + serialized, content_type = _serialize_tool_result(tool_result) + char_count = len(serialized) + + if persist_mode == ToolResultPersistMode.OVERFLOW and char_count <= tool_meta.max_characters: + return tool_result + + safe_tool_name = "".join(c if c.isalnum() or c in ("_", "-") else "_" for c in tool_name) + extension = ".json" if content_type == "json" else ".txt" + filename = f"{datetime.now().strftime('%Y-%m-%d-%H%M%S-%f')}-{safe_tool_name}_result{extension}" + is_truncated = char_count > tool_meta.max_characters + preview_limit = min(tool_meta.max_characters, PERSISTED_TOOL_PREVIEW_MAX_CHARS) + preview = serialized[:preview_limit] + if char_count > preview_limit: + preview += f"\n... (preview truncated, {char_count} total chars)" + + workspace = self._workspace + if workspace is None: + logger.debug( + "Skipping persistence for '%s': no workspace configured", + tool_name, + ) + return tool_result + + try: + workspace.ensure_dirs() + ref = workspace.save_artifact( + source="raw", + filename=filename, + content=serialized, + tool_name=tool_name, + content_type=content_type, + metadata={ + "persist_mode": persist_mode.value, + "char_count": char_count, + "max_characters": tool_meta.max_characters, + "preview_max_characters": preview_limit, + }, + ) + logger.debug( + "Persisted tool result for '%s' as artifact %s (%s chars)", + tool_name, + ref.artifact_id, + char_count, + ) + return { + "tool_name": tool_name, + "persist_mode": persist_mode.value, + "artifact_id": ref.artifact_id, + "artifact_path": ref.relative_path, + "truncated": is_truncated, + "preview": preview, + "_hint": ( + "Full tool result saved to workspace raw artifacts. " + f"Read artifact_id={ref.artifact_id} (path: {ref.relative_path}) to inspect complete output." 
+ ), + } + except Exception as e: + logger.warning("Failed to persist tool result for '%s': %s", tool_name, e) + return tool_result + ## Properties @property @@ -243,6 +618,284 @@ def chat_thread_id(self) -> str: """Return the current thread ID.""" return self._thread.id + @property + def has_workspace(self) -> bool: + """Whether this agent has an attached workspace.""" + return self._workspace is not None + + @property + def autonomous_iteration(self) -> int: + """Return the current/final autonomous iteration count.""" + return self._autonomous_iteration + + @property + def can_finalize(self) -> bool: + """ + Whether finalize tools should be available. + + Finalize tools are available only in autonomous mode after min_iterations. + """ + return ( + self.execution_mode == AgentExecutionMode.AUTONOMOUS + and self._autonomous_iteration >= self._autonomous_config.min_iterations + ) + + @property + def has_output_schema(self) -> bool: + """Whether an orchestrator-defined output schema has been set.""" + return self._task_output_schema is not None + + ## Autonomous extension points + + def _require_workspace(self) -> AgentWorkspace: + """Return workspace or raise a clear runtime error when unavailable.""" + if self._workspace is None: + raise RuntimeError( + "This agent has no workspace configured. " + "Workspace-scoped tools are unavailable.", + ) + return self._workspace + + def _get_autonomous_system_prompt(self) -> str: + """ + Return system prompt for autonomous mode. + + Subclasses can override this for custom autonomous behavior. + """ + return ( + self._get_system_prompt() + + dedent(""" + + ## Autonomous Execution + - Operate independently and use tools to complete the task. + - Keep reasoning concise and tool-driven. + - Use `add_note()` for warnings, assumptions, and blockers. + - Finalize only via the designated finalize tool. + """) + + self._get_output_schema_prompt_section() + + self._get_urgency_notice() + ) + + def _get_autonomous_initial_message(self, task: str) -> str: + """ + Build initial USER message for autonomous mode. + + Subclasses can override this for custom autonomous task framing. + """ + finalize_call = ( + "finalize_with_output(output={...})" + if self.has_output_schema + else "finalize_result(output={...})" + ) + return dedent( + f""" + Task: {task} + + Run autonomously until complete. + Use available tools to gather evidence and produce the best possible output. + When done, call `{finalize_call}`. + """ + ).strip() + + def _check_autonomous_completion(self, tool_name: str) -> bool: + """ + Check whether a tool call signals autonomous completion. + + Default implementation checks generic finalize tool names and + requires a wrapped result to be present. + """ + finalize_tools = ( + "finalize_with_output", + "finalize_with_failure", + "finalize_result", + "finalize_failure", + ) + if tool_name in finalize_tools: + return self._wrapped_result is not None + return False + + def _get_autonomous_result(self) -> BaseModel | None: + """ + Return autonomous run result. + + Subclasses may override this to map to specialized result objects. + """ + return self._wrapped_result + + def _reset_autonomous_state(self) -> None: + """ + Reset autonomous-mode state before a new run. + + Subclasses can extend this to clear their own autonomous fields. 
+ """ + self._task_output_schema = None + self._task_output_description = None + self._notes = [] + self._wrapped_result = None + self._finalize_with_output_failed = False + + ## Output schema helpers for autonomous runs + + def set_output_schema( + self, + schema: dict[str, Any], + description: str | None = None, + ) -> None: + """Set expected output schema for the current autonomous task.""" + self._task_output_schema = schema + self._task_output_description = description + + def _get_output_schema_prompt_section(self) -> str: + """Return formatted output schema prompt section for autonomous mode.""" + if not self._task_output_schema: + return "" + + parts = ["\n\n## Expected Output Schema\n"] + if self._task_output_description: + parts.append(f"**Description:** {self._task_output_description}\n\n") + parts.append("**Schema:**\n```json\n") + parts.append(json.dumps(self._task_output_schema, indent=2)) + parts.append("\n```\n") + parts.append( + "\nWhen ready, call `finalize_with_output(output={...})` with data matching this schema. " + "Use `add_note()` before finalizing to record notes, warnings, or errors." + ) + return "".join(parts) + + def _get_urgency_notice(self) -> str: + """Iteration-aware urgency notice for autonomous prompts.""" + finalize_tool = ( + "`finalize_with_output(output={...})`" + if self.has_output_schema + else "`finalize_result(output={...})`" + ) + if self.can_finalize: + remaining = self._autonomous_config.max_iterations - self._autonomous_iteration + if remaining <= 2: + return f"\n\n## URGENT: Only {remaining} iteration(s) left — call {finalize_tool} NOW." + if remaining <= 4: + return f"\n\n## Finalize soon — {remaining} iterations remaining." + return f"\n\n## {finalize_tool} is now available." + return f"\n\n## Continue exploring (iteration {self._autonomous_iteration})." + + @agent_tool( + availability=lambda self: ( + self.execution_mode == AgentExecutionMode.AUTONOMOUS + ) + ) + def add_note(self, note: str) -> dict[str, Any]: + """ + Add a note to the autonomous result wrapper. + + Args: + note: Note, warning, complaint, or error to include in final output. + """ + self._notes.append(note) + return {"status": "ok", "total_notes": len(self._notes)} + + @agent_tool(availability=lambda self: self.can_finalize and self.has_output_schema, token_optimized=True) + def _finalize_with_output(self, output: dict[str, Any]) -> dict[str, Any]: + """ + Finalize with output matching the orchestrator's expected schema. + + Args: + output: Result data matching the configured JSON schema. 
+ """ + if not self._task_output_schema: + return {"error": "No output schema defined for this task"} + + try: + jsonschema.validate(instance=output, schema=self._task_output_schema) + except jsonschema.ValidationError as e: + self._finalize_with_output_failed = True + return { + "error": "VALIDATION FAILED — output does not match the expected schema.", + "validation_error": str(e.message), + "schema_path": list(e.absolute_schema_path), + "hint": "Fix the output structure and call finalize_with_output again.", + } + + self._finalize_with_output_failed = False + self._wrapped_result = SpecialistResultWrapper( + output=output, + success=True, + notes=self._notes.copy(), + failure_reason=None, + ) + return { + "status": "success", + "message": "Output validated and stored successfully", + "notes_count": len(self._notes), + } + + @agent_tool(availability=lambda self: self.can_finalize and self.has_output_schema, token_optimized=True) + def _finalize_with_failure(self, reason: str) -> dict[str, Any]: + """ + Finalize with failure when a schema-based task cannot be completed. + + Args: + reason: Why the task could not be completed. + """ + if self._finalize_with_output_failed: + return { + "error": ( + "REJECTED — finalize_with_output already failed validation. " + "Fix output and retry finalize_with_output instead of giving up." + ), + } + + self._wrapped_result = SpecialistResultWrapper( + output=None, + success=False, + notes=self._notes.copy(), + failure_reason=reason, + ) + return { + "status": "failure", + "message": "Task marked as failed", + "reason": reason, + } + + @agent_tool(availability=lambda self: self.can_finalize and not self.has_output_schema, token_optimized=True) + def _finalize_result(self, output: dict[str, Any]) -> dict[str, Any]: + """ + Finalize and submit result data for tasks without a predefined schema. + + Args: + output: Result payload. + """ + self._wrapped_result = SpecialistResultWrapper( + output=output, + success=True, + notes=self._notes.copy(), + failure_reason=None, + ) + return { + "status": "success", + "message": "Result submitted successfully", + "notes_count": len(self._notes), + } + + @agent_tool(availability=lambda self: self.can_finalize and not self.has_output_schema, token_optimized=True) + def _finalize_failure(self, reason: str) -> dict[str, Any]: + """ + Finalize with failure for tasks without a predefined schema. + + Args: + reason: Why the task could not be completed. + """ + self._wrapped_result = SpecialistResultWrapper( + output=None, + success=False, + notes=self._notes.copy(), + failure_reason=reason, + ) + return { + "status": "failure", + "message": "Task marked as failed", + "reason": reason, + } + ## Public API def get_thread(self) -> ChatThread: @@ -255,6 +908,11 @@ def get_chats(self) -> list[Chat]: def reset(self) -> None: """Reset the conversation to a fresh state.""" + # Reset autonomous mode state + self.execution_mode = AgentExecutionMode.CONVERSATIONAL + self._autonomous_iteration = 0 + self._reset_autonomous_state() + old_chat_thread_id = self._thread.id self._thread = ChatThread() self._thread_persisted = False @@ -269,6 +927,45 @@ def reset(self) -> None: ## Tool registration and dispatch + def _get_tool_registration_payload(self, tool_meta: _ToolMeta) -> tuple[str, dict[str, Any]]: + """ + Return (description, parameters) used when registering a tool with the LLM. + + Subclasses can override to dynamically customize registration metadata + without reimplementing _sync_tools. 
+ """ + description, parameters = tool_meta.description, tool_meta.parameters + if tool_meta.name == "finalize_result": + description = ( + "Finalize and submit result data. " + "You MUST call this with a non-empty `output` object: " + "`finalize_result(output={...})`. Do NOT call with empty arguments." + ) + return description, parameters + + if tool_meta.name != "finalize_with_output" or not self._task_output_schema: + return description, parameters + + parameters = { + "type": "object", + "properties": { + "output": self._task_output_schema, + }, + "required": ["output"], + } + desc_suffix = ( + f" Output description: {self._task_output_description}" + if self._task_output_description + else "" + ) + description = ( + "Finalize with output matching the expected schema. " + "The output parameter MUST include all required fields " + "defined in the schema — do NOT call with empty arguments." + + desc_suffix + ) + return description, parameters + def _sync_tools(self) -> None: """ Synchronize tools registered with the LLM client based on current availability. @@ -288,10 +985,11 @@ def _sync_tools(self) -> None: available = tool_meta.availability(self) if callable(tool_meta.availability) else tool_meta.availability if not available: continue + description, parameters = self._get_tool_registration_payload(tool_meta) self.llm_client.register_tool( name=tool_meta.name, - description=tool_meta.description, - parameters=tool_meta.parameters, + description=description, + parameters=parameters, ) self._registered_tool_names.add(tool_meta.name) logger.debug("Synced %s total tools: %s", len(collected_tools), self._registered_tool_names) @@ -321,13 +1019,32 @@ def _execute_tool(self, tool_name: str, tool_arguments: dict[str, Any]) -> dict[ # validate required parameters required = tool_meta.parameters.get("required", []) + valid_params = set(tool_meta.parameters.get("properties", {}).keys()) missing = [p for p in required if p not in tool_arguments or tool_arguments[p] is None] + extra = set(tool_arguments.keys()) - valid_params + + # Auto-wrap: if the tool expects a single "output" dict parameter and the LLM + # passed the dict contents as top-level kwargs instead, wrap them automatically. + # This is the #1 cause of finalize_with_output / finalize_result failures. + if ( + missing + and valid_params == {"output"} + and "output" not in tool_arguments + and tool_arguments # LLM passed *something* + ): + logger.info( + "Auto-wrapping %d top-level arg(s) into 'output' for tool '%s'", + len(tool_arguments), tool_name, + ) + tool_arguments = {"output": dict(tool_arguments)} + missing = [] + extra = set() + if missing: - return {"error": f"Missing required parameter(s): {', '.join(missing)}"} + return {"error": f"Missing required parameter(s): {', '.join(missing)}. 
" + f"Expected parameters: {', '.join(sorted(valid_params))}"} # validate no extra parameters - valid_params = set(tool_meta.parameters.get("properties", {}).keys()) - extra = set(tool_arguments.keys()) - valid_params if extra: return {"error": f"Unknown parameter(s) for '{tool_name}': {', '.join(sorted(extra))}"} @@ -343,18 +1060,41 @@ def _execute_tool(self, tool_name: str, tool_arguments: dict[str, Any]) -> dict[ else: validated_arguments[param_name] = value except ValidationError as e: - # extract readable error message + # extract readable error message with actionable guidance errors = e.errors() if errors: err = errors[0] - msg = f"{param_name}: expected {err.get('type', 'valid type')}, got {type(value).__name__}" + got_type = type(value).__name__ + expected = err.get("type", "valid type") + msg = f"{param_name}: expected {expected}, got {got_type}" + # Add actionable hint for common dict vs string confusion + if got_type == "str" and "dict" in expected: + msg += ( + ". You passed a string but this parameter requires a JSON object. " + 'Example: {"key": "value", "nested": {"a": 1}} — NOT a string.' + ) else: msg = str(e) return {"error": f"Invalid argument type: {msg}"} logger.debug("Executing tool %s with arguments: %s", tool_name, tool_arguments) # handler is unbound (from cls, not self) so pass self explicitly - return handler(self, **validated_arguments) + raw_result = handler(self, **validated_arguments) + result_for_llm = self._maybe_persist_tool_result( + tool_name=tool_name, + tool_meta=tool_meta, + tool_result=raw_result, + ) + if tool_meta.token_optimized: + if isinstance(result_for_llm, dict) and "artifact_id" in result_for_llm: + result_for_llm = { + **result_for_llm, + "_token_optimized_note": ( + "This chat output is token-optimized; the saved artifact contains raw output." + ), + } + return token_optimized_decorator(lambda: result_for_llm)() + return result_for_llm @classmethod @functools.lru_cache @@ -418,97 +1158,299 @@ def _get_documentation_prompt_section(self) -> str: return "\n".join(lines) + def _list_workspace_files_scoped(self, path: str) -> dict[str, Any]: + """List files under a workspace subpath.""" + if not self.has_workspace: + return {"error": "workspace scope unavailable: no workspace configured"} + normalized_path = path.strip() or "." 
+ if normalized_path in {".", "./"}: + workspace = self._require_workspace() + result = workspace.list_files() + return {"scope": "workspace", "path": ".", **result} + + workspace_root = self._require_workspace().root_path.resolve() + resolved = (workspace_root / normalized_path).resolve() + try: + resolved.relative_to(workspace_root) + except ValueError: + return {"error": f"Access denied: '{path}' is outside the workspace directory"} + if not resolved.exists(): + return {"error": f"Path not found: {path}"} + if resolved.is_file(): + return { + "scope": "workspace", + "path": normalized_path, + "type": "file", + "total_files": 1, + "files": [{ + "path": normalized_path, + "size_bytes": resolved.stat().st_size, + }], + } + + tree_lines: list[str] = [f"{Path(normalized_path).name or normalized_path}/"] + total_files = 0 + for dirpath, dirnames, filenames in sorted(resolved.walk()): + rel_dir = dirpath.relative_to(resolved) + depth = len(rel_dir.parts) + indent = " " * depth + if depth > 0: + tree_lines.append(f"{indent}{rel_dir.name}/") + dirnames.sort() + for filename in sorted(filenames): + tree_lines.append(f"{indent} {filename}") + total_files += 1 + + return { + "scope": "workspace", + "path": normalized_path, + "tree": "\n".join(tree_lines), + "total_files": total_files, + } + + def _list_docs_files(self, file_type: str | None, top_n: int) -> dict[str, Any]: + """List indexed documentation/code files.""" + if self._documentation_data_loader is None: + return {"error": "docs scope unavailable: documentation_data_loader is not configured"} + + normalized_file_type = file_type.strip().lower() if file_type else None + if normalized_file_type and normalized_file_type not in {"documentation", "code"}: + return {"error": "file_type must be 'documentation' or 'code'"} + + rows: list[dict[str, Any]] = [] + for entry in self._documentation_data_loader.entries: + if normalized_file_type and entry.file_type.value != normalized_file_type: + continue + rows.append({ + "path": str(entry.path), + "file_type": entry.file_type.value, + "title": entry.title, + "summary": entry.summary, + "size_bytes": entry.size_bytes, + }) + + rows.sort(key=lambda row: row["path"]) + return { + "scope": "docs", + "file_type": normalized_file_type, + "total_files": len(rows), + "files": rows[:max(1, top_n)], + } + + def _search_workspace_files( + self, + query: str, + mode: str, + case_sensitive: bool, + top_n: int, + ) -> dict[str, Any]: + """Search text files in the workspace.""" + if not self.has_workspace: + return {"error": "workspace scope unavailable: no workspace configured"} + if not query: + return {"error": "query is required"} + if mode not in {"exact", "terms", "regex"}: + return {"error": "mode must be 'exact', 'terms', or 'regex'"} + + workspace_root = self._require_workspace().root_path.resolve() + compiled_regex: re.Pattern[str] | None = None + if mode == "regex": + flags = 0 if case_sensitive else re.IGNORECASE + try: + compiled_regex = re.compile(query, flags) + except re.error as e: + return {"error": f"Invalid regex pattern: {e}"} + + search_query = query if case_sensitive else query.lower() + terms = _parse_search_terms(query) + normalized_terms = terms if case_sensitive else [t.lower() for t in terms] + + results: list[dict[str, Any]] = [] + for file_path in workspace_root.rglob("*"): + if not file_path.is_file(): + continue + try: + if file_path.stat().st_size > 512_000: + continue + content = file_path.read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError): + continue + if "\x00" in 
content: + continue + + score = 0 + matches: list[dict[str, Any]] = [] + for line_number, line in enumerate(content.splitlines(), start=1): + search_line = line if case_sensitive else line.lower() + line_hits = 0 + if mode == "exact": + line_hits = search_line.count(search_query) + elif mode == "terms": + for term in normalized_terms: + line_hits += search_line.count(term) + else: + assert compiled_regex is not None + line_hits = len(list(compiled_regex.finditer(line))) + + if line_hits > 0: + score += line_hits + matches.append({ + "line_number": line_number, + "line_content": line.strip(), + "hits": line_hits, + }) + if len(matches) >= 10: + break + + if score == 0: + continue + results.append({ + "path": str(file_path.relative_to(workspace_root)), + "score": score, + "matches": matches, + }) + + results.sort(key=lambda row: row["score"], reverse=True) + capped_results = results[:max(1, top_n)] + if not capped_results: + return {"scope": "workspace", "mode": mode, "query": query, "message": f"No matches found for '{query}'"} + return { + "scope": "workspace", + "mode": mode, + "query": query, + "files_with_matches": len(capped_results), + "results": capped_results, + } + @agent_tool( - availability=lambda self: self._documentation_data_loader is not None, + availability=lambda self: self.has_workspace or self._documentation_data_loader is not None, + persist=ToolResultPersistMode.NEVER, + token_optimized=True, parameters={ "type": "object", "properties": { - "query": { + "scope": { + "type": "string", + "enum": ["workspace", "docs"], + "description": "Where to list files from.", + }, + "path": { "type": "string", - "description": "The exact string to search for.", + "description": "Workspace path to list (workspace scope only). Defaults to '.'.", }, "file_type": { "type": "string", "enum": ["documentation", "code"], - "description": "Optional filter by file type.", + "description": "Optional docs-only file type filter.", }, - "case_sensitive": { - "type": "boolean", - "description": "Whether the search should be case-sensitive. Defaults to false.", + "top_n": { + "type": "integer", + "description": "Max docs files to return. Defaults to 200.", }, }, - "required": ["query"], + "required": ["scope"], }, ) - @token_optimized - def _search_docs( + def _list_files( self, - query: str, + scope: str, + path: str = ".", file_type: str | None = None, - case_sensitive: bool = False, + top_n: int = 200, ) -> dict[str, Any]: """ - Search documentation/code file contents for an exact query string (like Cmd+F). - - Returns line numbers where matches are found. Use get_doc_file to read around those lines. + List files from workspace or docs. Args: - query: The exact string to search for. - file_type: Optional filter: 'documentation' for docs, 'code' for source files. - case_sensitive: Whether the search should be case-sensitive. Defaults to false. + scope: Either 'workspace' or 'docs'. + path: Workspace subpath to list when scope='workspace'. Defaults to '.'. + file_type: Optional docs-only filter. + top_n: Max docs entries to return. 
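+                Ignored when scope='workspace'.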
""" - if not query: - return {"error": "query is required"} - - file_type_enum = FileType(file_type) if file_type else None - - results = self._documentation_data_loader.search_content_with_lines( - query=query, - file_type=file_type_enum, - case_sensitive=case_sensitive, - max_matches_per_file=10, - ) - - if not results: - return {"message": f"No matches found for '{query}'", "case_sensitive": case_sensitive} + try: + normalized_scope = _normalize_file_scope(scope) + except ValueError as e: + return {"error": str(e)} - return { - "query": query, - "case_sensitive": case_sensitive, - "files_with_matches": len(results), - "results": results[:20], - } + if normalized_scope == "workspace": + return self._list_workspace_files_scoped(path) + return self._list_docs_files(file_type=file_type, top_n=top_n) - @agent_tool(availability=lambda self: self._documentation_data_loader is not None) - @token_optimized - def _get_doc_file( + @agent_tool( + availability=lambda self: self.has_workspace or self._documentation_data_loader is not None, + persist=ToolResultPersistMode.NEVER, + token_optimized=True, + parameters={ + "type": "object", + "properties": { + "scope": { + "type": "string", + "enum": ["workspace", "docs"], + "description": "Where to read the file from.", + }, + "path": { + "type": "string", + "description": "Path to file. Workspace paths are relative to workspace root.", + }, + "start_line": { + "type": "integer", + "description": "Optional 1-based start line number.", + }, + "end_line": { + "type": "integer", + "description": "Optional 1-based end line number (inclusive).", + }, + }, + "required": ["scope", "path"], + }, + ) + def _read_file( self, + scope: str, path: str, start_line: int | None = None, end_line: int | None = None, ) -> dict[str, Any]: """ - Read documentation/code file content by path. - - Supports optional line range. Use start_line/end_line to read around matches from search_docs. + Read a file from workspace or docs. Args: - path: The file path (can be partial, will match). - start_line: Starting line number (1-indexed, inclusive). Omit for beginning. - end_line: Ending line number (1-indexed, inclusive). Omit to read to end. + scope: Either 'workspace' or 'docs'. + path: Path to file (workspace-relative when scope='workspace'). + start_line: Optional 1-based start line number. + end_line: Optional 1-based end line number (inclusive). 
""" if not path: return {"error": "path is required"} + try: + normalized_scope = _normalize_file_scope(scope) + except ValueError as e: + return {"error": str(e)} + + if normalized_scope == "workspace": + if not self.has_workspace: + return {"error": "workspace scope unavailable: no workspace configured"} + workspace = self._require_workspace() + return {"scope": "workspace", **workspace.read_file(path, start_line=start_line, end_line=end_line)} + + if self._documentation_data_loader is None: + return {"error": "docs scope unavailable: documentation_data_loader is not configured"} if start_line is not None or end_line is not None: - result = self._documentation_data_loader.get_file_lines( - path=path, start_line=start_line, end_line=end_line, - ) + result = self._documentation_data_loader.get_file_lines(path, start_line=start_line, end_line=end_line) if result is None: return {"error": f"File '{path}' not found"} - content, total_lines = result + content_lines = content.count("\n") + (1 if content else 0) + read_start = start_line or 1 + read_end = read_start + max(content_lines - 1, 0) + if len(content) > 5_000: + content = ( + content[:5_000] + + f"\n... [output too large... read lines {read_start} - {read_end}]" + ) return { + "scope": "docs", "path": path, "lines_shown": f"{start_line or 1}-{end_line or total_lines}", "total_lines": total_lines, @@ -521,13 +1463,12 @@ def _get_doc_file( content = entry.content total_lines = content.count("\n") + 1 - - if len(content) > 10000: - content = content[:10000] + f"\n... (truncated, {len(entry.content)} total chars)" - + if len(content) > 5_000: + content = content[:5_000] + f"\n... [output too large... read lines 1 - {total_lines}]" return { + "scope": "docs", "path": str(entry.path), - "file_type": entry.file_type, + "file_type": entry.file_type.value, "title": entry.title, "summary": entry.summary, "total_lines": total_lines, @@ -535,72 +1476,145 @@ def _get_doc_file( } @agent_tool( - availability=lambda self: self._documentation_data_loader is not None, + availability=lambda self: self.has_workspace or self._documentation_data_loader is not None, + persist=ToolResultPersistMode.OVERFLOW, + token_optimized=True, parameters={ "type": "object", "properties": { - "terms": { - "type": "array", - "items": {"type": "string"}, - "description": "List of search terms (case-insensitive).", + "scope": { + "type": "string", + "enum": ["workspace", "docs"], + "description": "Where to search files.", }, - "top_n": { - "type": "integer", - "description": "Number of top results to return. Defaults to 20.", + "query": { + "type": "string", + "description": "Search query.", }, - }, - "required": ["terms"], - }, - ) - @token_optimized - def _search_docs_by_terms(self, terms: list[str], top_n: int = 20) -> dict[str, Any]: - """ - Search documentation files by multiple terms with relevance scoring. - - Ranks files by how many terms match and total hit count. Good for broad topic searches. - - Args: - terms: List of search terms (case-insensitive). - top_n: Number of top results to return. Defaults to 20. 
- """ - if not terms: - return {"error": "terms list is required"} - - results = self._documentation_data_loader.search_by_terms(terms=terms, top_n=top_n) - return {"terms": terms, "results_count": len(results), "results": results} - - @agent_tool( - availability=lambda self: self._documentation_data_loader is not None, - parameters={ - "type": "object", - "properties": { - "pattern": { + "mode": { "type": "string", - "description": "Regex pattern to search for.", + "enum": ["exact", "terms", "regex"], + "description": "Search mode. Defaults to exact.", + }, + "file_type": { + "type": "string", + "enum": ["documentation", "code"], + "description": "Optional docs-only file type filter.", + }, + "case_sensitive": { + "type": "boolean", + "description": "Whether matching is case-sensitive. Defaults to false.", }, "top_n": { "type": "integer", - "description": "Max entries to return. Defaults to 20.", + "description": "Maximum number of results to return. Defaults to 20.", }, }, - "required": ["pattern"], + "required": ["scope", "query"], }, ) - @token_optimized - def _search_docs_by_regex(self, pattern: str, top_n: int = 20) -> dict[str, Any]: + def _search_files( + self, + scope: str, + query: str, + mode: str = "exact", + file_type: str | None = None, + case_sensitive: bool = False, + top_n: int = 20, + ) -> dict[str, Any]: """ - Search documentation files by regex pattern with timeout protection. - - Returns matching snippets with context. Useful for pattern-based searches. + Search files in workspace or docs. Args: - pattern: Regex pattern to search for. - top_n: Max entries to return. Defaults to 20. + scope: Either 'workspace' or 'docs'. + query: Query string. + mode: Search mode: exact, terms, or regex. + file_type: Optional docs-only file type filter. + case_sensitive: Whether to match case-sensitively. + top_n: Maximum number of results to return. 
""" - if not pattern: - return {"error": "pattern is required"} + try: + normalized_scope = _normalize_file_scope(scope) + except ValueError as e: + return {"error": str(e)} + normalized_mode = mode.strip().lower() + + if normalized_scope == "workspace": + return self._search_workspace_files( + query=query, + mode=normalized_mode, + case_sensitive=case_sensitive, + top_n=top_n, + ) + + if self._documentation_data_loader is None: + return {"error": "docs scope unavailable: documentation_data_loader is not configured"} + if not query: + return {"error": "query is required"} - return self._documentation_data_loader.search_by_regex(pattern=pattern, top_n=top_n) + normalized_file_type = file_type.strip().lower() if file_type else None + if normalized_file_type and normalized_file_type not in {"documentation", "code"}: + return {"error": "file_type must be 'documentation' or 'code'"} + file_type_enum = FileType(normalized_file_type) if normalized_file_type else None + + if normalized_mode == "exact": + results = self._documentation_data_loader.search_content_with_lines( + query=query, + file_type=file_type_enum, + case_sensitive=case_sensitive, + max_matches_per_file=10, + ) + if not results: + return {"scope": "docs", "mode": "exact", "query": query, "message": f"No matches found for '{query}'"} + return { + "scope": "docs", + "mode": "exact", + "query": query, + "case_sensitive": case_sensitive, + "files_with_matches": len(results), + "results": results[:max(1, top_n)], + } + + if normalized_mode == "terms": + terms = _parse_search_terms(query) + if not terms: + return {"error": "query must contain at least one term for terms mode"} + results = self._documentation_data_loader.search_by_terms(terms=terms, top_n=top_n) + if normalized_file_type: + filtered: list[dict[str, Any]] = [] + for row in results: + entry_id = row.get("id") + if not entry_id: + continue + entry = self._documentation_data_loader.get_file_by_path(entry_id) + if entry and entry.file_type.value == normalized_file_type: + filtered.append(row) + results = filtered + return { + "scope": "docs", + "mode": "terms", + "query": query, + "terms": terms, + "results_count": len(results), + "results": results, + } + + if normalized_mode == "regex": + regex_results = self._documentation_data_loader.search_by_regex(pattern=query, top_n=top_n) + if normalized_file_type and "matches" in regex_results: + matches = regex_results.get("matches", []) + filtered_matches: list[dict[str, Any]] = [] + for match in matches: + entry_id = match.get("id") + if not entry_id: + continue + entry = self._documentation_data_loader.get_file_by_path(entry_id) + if entry and entry.file_type.value == normalized_file_type: + filtered_matches.append(match) + regex_results["matches"] = filtered_matches + return {"scope": "docs", "mode": "regex", "query": query, **regex_results} + + return {"error": "mode must be 'exact', 'terms', or 'regex'"} ## Tool availability prompt section @@ -628,6 +1642,51 @@ def _get_tool_availability_prompt_section(self) -> str: return "\n".join(lines) + def _get_workspace_usage_prompt_section(self) -> str: + """ + Build a concise workspace usage guide for the system prompt. + + Uses class-level WORKSPACE_USAGE_SECTION so specialized agents can + override the full section text. 
+ """ + if not self.has_workspace: + return "" + section = self.WORKSPACE_USAGE_SECTION.strip() + mounted_section = self._get_mounted_inputs_prompt_section() + if not section and not mounted_section: + return "" + + parts: list[str] = [] + if section: + parts.append(section) + if mounted_section: + parts.append(mounted_section) + + return "\n\n" + "\n\n".join(parts) + + def _get_mounted_inputs_prompt_section(self) -> str: + """Build a system prompt section listing currently mounted input files.""" + if not self.has_workspace: + return "" + workspace = self._require_workspace() + mounted_inputs = workspace.list_mounted_inputs() + if not mounted_inputs: + return "" + + lines = [ + "## Mounted Input Files (read-only)", + "The following files are mounted under `raw/` and can be read directly without tools:", + "```python", + "text = open('raw/', encoding='utf-8').read()", + "blob = open('raw/', 'rb').read()", + "```", + ] + for item in mounted_inputs: + lines.append( + f"- `{item.relative_path}` — {item.size_bytes} bytes (source: `{item.source_path}`)" + ) + return "\n".join(lines) + ## LLMs and streaming def _call_llm( @@ -653,6 +1712,11 @@ def _call_llm( # append tool availability (injected here so subclasses can't accidentally omit it) system_prompt = system_prompt + self._get_tool_availability_prompt_section() + # append workspace usage guide (injected centrally for all agents) + workspace_section = self._get_workspace_usage_prompt_section() + if workspace_section: + system_prompt = system_prompt + workspace_section + # append documentation context (injected here so subclasses can't accidentally omit it) docs_section = self._get_documentation_prompt_section() if docs_section: @@ -752,6 +1816,9 @@ def _add_chat( if self._persist_chat_thread_callable: self._thread = self._persist_chat_thread_callable(self._thread) + if self._on_chat_added is not None: + self._on_chat_added(chat) + return chat def _build_messages_for_llm(self) -> list[dict[str, Any]]: @@ -872,6 +1939,119 @@ def execute_one(tc: LLMToolCall) -> tuple[LLMToolCall, str]: tool_call_id=tool_call.call_id, ) + ## autonomous loop + + def run_autonomous( + self, + task: str, + config: AutonomousRunConfig | None = None, + output_schema: dict[str, Any] | None = None, + output_description: str | None = None, + ) -> BaseModel | None: + """ + Run the agent autonomously to completion. + + Subclasses may override autonomous extension hooks for domain-specific + prompting and completion behavior. + """ + self.execution_mode = AgentExecutionMode.AUTONOMOUS + try: + self._autonomous_iteration = 0 + self._autonomous_config = config or AutonomousRunConfig() + + # Subclasses should clear specialized result fields in overrides. + self._reset_autonomous_state() + + # Set output schema after reset so it is retained for this run. 
+ if output_schema: + self.set_output_schema(output_schema, output_description) + + initial_message = self._get_autonomous_initial_message(task) + self._add_chat(ChatRole.USER, initial_message) + + logger.info( + "Starting %s autonomous run for task: %s", + self.__class__.__name__, task, + ) + self._run_autonomous_loop() + return self._get_autonomous_result() + finally: + self.execution_mode = AgentExecutionMode.CONVERSATIONAL + + def _run_autonomous_loop(self) -> None: + """Run the autonomous loop with iteration tracking and finalize gating.""" + max_iterations = self._autonomous_config.max_iterations + for iteration in range(max_iterations): + self._autonomous_iteration = iteration + 1 + logger.debug( + "Autonomous loop iteration %d/%d", + self._autonomous_iteration, + max_iterations, + ) + + messages = self._build_messages_for_llm() + try: + response = self._call_llm( + messages, + self._get_autonomous_system_prompt(), + tool_choice="required", + ) + + if response.response_id: + self._previous_response_id = response.response_id + + if response.content or response.tool_calls: + chat = self._add_chat( + role=ChatRole.ASSISTANT, + content=response.content or "", + tool_calls=response.tool_calls if response.tool_calls else None, + llm_provider_response_id=response.response_id, + ) + if response.content: + self._emit_message( + ChatResponseEmittedMessage( + content=response.content, + chat_id=chat.id, + chat_thread_id=self._thread.id, + ) + ) + + if not response.tool_calls: + logger.warning( + "Autonomous loop: no tool calls in iteration %d (unexpected with tool_choice=required)", + self._autonomous_iteration, + ) + return + + for tool_call in response.tool_calls: + result_str = self._auto_execute_tool( + tool_call.tool_name, + tool_call.tool_arguments, + ) + + self._add_chat( + role=ChatRole.TOOL, + content=f"Tool '{tool_call.tool_name}' result: {result_str}", + tool_call_id=tool_call.call_id, + ) + + if self._check_autonomous_completion(tool_call.tool_name): + logger.debug( + "Autonomous run completed at iteration %d", + self._autonomous_iteration, + ) + return + + except Exception as e: + logger.exception("Error in autonomous loop: %s", e) + self._emit_message(ErrorEmittedMessage(error=str(e))) + return + + logger.warning( + "Autonomous loop hit max iterations (%d) without finalization", + max_iterations, + ) + ## agent loop (basic implementation, can be overridden) def _run_agent_loop(self) -> None: diff --git a/bluebox/agents/bluebox_agent.py b/bluebox/agents/bluebox_agent.py index 0364ab8e..05772999 100644 --- a/bluebox/agents/bluebox_agent.py +++ b/bluebox/agents/bluebox_agent.py @@ -22,7 +22,7 @@ import requests from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool -from bluebox.agents.workspace import AgentWorkspace, LocalWorkspace +from bluebox.workspace import AgentWorkspace from bluebox.config import Config from bluebox.data_models.agents.context import BlueBoxAgentContext, UsedRoutine from bluebox.data_models.browser_agent import ( @@ -43,14 +43,6 @@ ) from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel from bluebox.data_models.routine.routine import RoutineExecutionRequest, RoutineInfo -from bluebox.utils.code_execution_sandbox import ( - BLOCKED_MODULES, - BLOCKED_PATTERNS, - execute_python_sandboxed, - get_active_sandbox_mode, - get_workaround_for_error, -) -from bluebox.utils.llm_utils import token_optimized from bluebox.utils.logger import get_logger logger = get_logger(name=__name__) @@ -80,32 +72,28 @@ class 
BlueBoxAgent(AbstractAgent): 1. **Search broadly**: When the user makes a request, use `search_routines` with a task description that describes what the user wants to do. This runs semantic search, so add some detail. You can run this multiple times if needed to get more results. 2. **Execute all relevant routines**: Run ALL routines that could plausibly fulfill the user's request via `execute_routines_in_parallel`. When in doubt, include the routine — running an extra routine is cheap, missing a relevant one is costly. Each routine execution requires a `routine_id` from the search results and a `parameters` dict keyed by parameter name with the corresponding value (e.g. {"origin": "New York", "date": "2025-03-01"}). Make sure to provide all required parameters as listed in the search results. 3. **Fallback to browser agent**: If NO routines match after thorough searching, use `execute_browser_task` to perform the task via an AI-driven browser agent. Write a clear, detailed natural language instruction for the task. - 4. **Post-process results**: Use `run_python_code` to transform routine results into clean output files (CSV, JSON, JSONL, etc.) for the user. - 5. **Verify output**: After writing files, use `list_workspace_files` and `read_workspace_file` to verify the output looks correct. If it doesn't, fix the code and rerun. + 4. **Post-process results**: Use `execute_python` to transform routine results into clean output files (CSV, JSON, JSONL, etc.) for the user. + 5. **Verify output**: After writing files, use `list_files(scope="workspace")` and `read_file(scope="workspace", path=...)` to verify the output looks correct. If it doesn't, fix the code and rerun. 6. **Report results**: Summarize what was executed and the output files to the user. ## Workspace Your workspace has the following structure: - - `raw/` — routine result JSON files, saved automatically when routines execute - - `outputs/` — write all your generated output files here (CSV, JSON, JSONL, etc.) + - `raw/` (read-only) — routine result JSON files and mounted inputs + - `output/` — write all your generated output files here (CSV, JSON, JSONL, etc.) - `context/` — context files (JSON + Markdown) saved by `generate_context`, used for session replay + - `meta/` (read-only) — system-managed manifests and metadata - **Pre-loaded variables in `run_python_code`:** - - `routine_results` — list of dicts, one per JSON file in raw/ - - `json` — for parsing and serialization - - `csv` — for CSV reading/writing - - `Path` — from pathlib, for path operations (do NOT import pathlib — use `Path` directly) - - `open()` — scoped to the workspace directory for safe file I/O + **Reading routine outputs in `execute_python`:** + - Use `list_files(scope="workspace")` to see files in `raw/` + - Read raw JSON files directly in Python: + `records = [json.loads(p.read_text()) for p in Path("raw").glob("*.json")]` + - Use `read_file(scope="workspace", path="...")` to inspect any file by relative path (e.g. "raw/25-01-15-143052-routine_result_1.json" or "output/results.csv"). Use optional start_line/end_line for large files. **Writing output files:** - - Write to the outputs/ subdirectory: `with open("outputs/results.csv", "w") as f: ...` - - **Inspecting files:** - - Use `list_workspace_files` to see all files in the workspace - - Use `read_workspace_file` to read any file by relative path (e.g. "raw/25-01-15-143052-routine_result_1.json" or "outputs/results.csv"). Use optional start_line/end_line for large files. 
+ - Write to the output/ subdirectory: `with open("output/results.csv", "w") as f: ...` ## Routine Result Structure - Each entry in `routine_results` is the raw API response JSON saved by `execute_routines_in_parallel`. The structure is: + Each JSON file in `raw/` from `execute_routines_in_parallel` has this structure: ``` { @@ -122,16 +110,16 @@ class BlueBoxAgent(AbstractAgent): } ``` - **Path to the payload:** `rr["result"]["data"]` for each `rr` in `routine_results`. - **Input parameters:** `rr["parameters"]` for each `rr` in `routine_results`. + **Path to the payload:** `record["result"]["data"]`. + **Input parameters:** `record["parameters"]`. - **Important:** The payload shape varies per routine — different routines return different key names and structures. Always start your post-processing code by printing `rr["routine_name"]` and `rr["result"]["data"].keys()` to understand what each routine returned before trying to extract specific fields. + **Important:** The payload shape varies per routine — different routines return different key names and structures. Always inspect a few raw records first before extracting fields. ## Post-Processing with Python - - After routines return results, ALWAYS use `run_python_code` to post-process data and generate clean output files. + - After routines return results, ALWAYS use `execute_python` to post-process data and generate clean output files. - **ALWAYS add debug print() statements** in your code so you can see what's happening: print key counts, data shapes, sample values, etc. stdout is captured and returned to you. - - **On first pass, always explore the data**: before writing any output file, print the routine names and top-level keys of each result's payload so you understand the shape. Then write extraction code. - - **Be persistent**: If your code errors or produces unexpected results, read the error/output carefully, use `list_workspace_files` and `read_workspace_file` to inspect the data, fix the code, and try again. Keep iterating until you produce the correct output file. NEVER give up after one failed attempt — debug and retry. + - **On first pass, always explore the data**: before writing any output file, load records from `raw/*.json`, print routine names and top-level keys, then write extraction code. + - **Be persistent**: If your code errors or produces unexpected results, read the error/output carefully, use `list_files(scope="workspace")` and `read_file(scope="workspace", path=...)` to inspect the data, fix the code, and try again. Keep iterating until you produce the correct output file. NEVER give up after one failed attempt — debug and retry. ## Important Rules - **Always prefer routines over `execute_browser_task`**. Routines are faster, cheaper, and more reliable. Only use the browser agent as a fallback when no suitable routine exists. 
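A minimal sketch of the post-processing flow the prompt above describes, assuming the documented `raw/` record structure (`routine_name`, `parameters`, `result.data`); the output filename and CSV columns are illustrative, not part of the codebase:

```python
import csv
import json
from pathlib import Path

# Load every routine result saved under raw/ (structure documented above).
records = [json.loads(p.read_text()) for p in Path("raw").glob("*.json")]

# First pass: explore the payload shape before extracting fields.
for record in records:
    print(record["routine_name"], list(record["result"]["data"].keys()))

# Second pass: write a clean output file to output/.
with open("output/results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["routine_name", "parameters"])  # illustrative columns
    for record in records:
        writer.writerow([record["routine_name"], json.dumps(record["parameters"])])
```

The explore-then-extract split mirrors the prompt's rule to inspect payload shapes before writing extraction code.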
@@ -146,13 +134,13 @@ class BlueBoxAgent(AbstractAgent): def __init__( self, emit_message_callable: Callable[[EmittedMessage], None], + workspace: AgentWorkspace, persist_chat_callable: Callable[[Chat], Chat] | None = None, persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, stream_chunk_callable: Callable[[str], None] | None = None, llm_model: LLMModel = OpenAIModel.GPT_5_2, chat_thread: ChatThread | None = None, existing_chats: list[Chat] | None = None, - workspace: AgentWorkspace | None = None, auth_headers_provider: Callable[[], dict[str, str]] | None = None, on_llm_response: Callable[[LLMChatResponse], None] | None = None, context_file: str | None = None, @@ -168,7 +156,7 @@ def __init__( llm_model: The LLM model to use for conversation. chat_thread: Existing ChatThread to continue, or None for new conversation. existing_chats: Existing Chat messages if loading from persistence. - workspace: Workspace for file I/O. Defaults to LocalWorkspace if not provided. + workspace: Workspace for file I/O. auth_headers_provider: Optional callback that returns auth headers for downstream API calls. If not provided, falls back to Config.VECTORLY_SERVICE_TOKEN. on_llm_response: Optional callback invoked after each LLM call with the response (for token tracking). @@ -181,7 +169,7 @@ def __init__( if not auth_headers_provider and not Config.VECTORLY_SERVICE_TOKEN: raise ValueError("Either auth_headers_provider or VECTORLY_SERVICE_TOKEN must be provided") - self._workspace = workspace or LocalWorkspace() + self._workspace = workspace self._routine_cache: dict[str, RoutineInfo] = {} self._routine_execution_counter = itertools.count( self._get_next_routine_result_index() @@ -192,6 +180,7 @@ def __init__( super().__init__( emit_message_callable=emit_message_callable, + workspace=self._workspace, persist_chat_callable=persist_chat_callable, persist_chat_thread_callable=persist_chat_thread_callable, stream_chunk_callable=stream_chunk_callable, @@ -200,12 +189,9 @@ def __init__( existing_chats=existing_chats, documentation_data_loader=None, on_llm_response=on_llm_response, + allow_code_execution=True, ) - # Detect sandbox mode once (work_dir is always set for BlueBoxAgent) - self._sandbox_mode = get_active_sandbox_mode(work_dir_set=True) - self._is_blocklist_mode = self._sandbox_mode == "blocklist" - logger.debug( "BlueBoxAgent initialized with model: %s, chat_thread_id: %s, sandbox_mode: %s, has_context: %s", llm_model, @@ -237,41 +223,11 @@ def _get_system_prompt(self) -> str: now = datetime.now() time_info = f"\n\n## Current Time\n{now.strftime('%Y-%m-%d %H:%M:%S %Z').strip()}" prompt = self.SYSTEM_PROMPT + time_info - if self._is_blocklist_mode: - prompt += self._get_blocklist_sandbox_prompt_section() + prompt += self._generate_code_execution_prompt() if self._agent_context: prompt += self._get_context_prompt_section() return prompt - def _get_blocklist_sandbox_prompt_section(self) -> str: - """Build prompt section explaining blocklist sandbox restrictions.""" - blocked_modules_str = ", ".join(sorted(BLOCKED_MODULES)) - # Exclude open( from blocked patterns list since it IS available with workspace - blocked_patterns_str = ", ".join( - f"`{p}`" for p, _ in BLOCKED_PATTERNS if p != "open(" - ) - - return dedent(f""" - - ## Sandbox Restrictions (IMPORTANT — read before writing any Python code) - You are running in restricted sandbox mode. Your `run_python_code` calls have strict restrictions. 
- - **Blocked imports** — do NOT import any of these modules: - {blocked_modules_str} - - **Blocked code patterns** — do NOT use any of these in your code: - {blocked_patterns_str} - - **Safe imports you CAN use:** - `collections`, `re`, `datetime`, `math`, `itertools`, `functools`, `operator`, `string`, `textwrap`, `decimal`, `fractions`, `statistics`, `urllib.parse`, `hashlib`, `hmac`, `base64`, `copy`, `pprint`, `dataclasses`, `enum`, `typing` - - **Key rules to avoid errors:** - - Do NOT `import os`, `import pathlib`, `import sys`, or any blocked module - - `Path` is already pre-loaded — use it directly, do NOT `import pathlib` - - `open()` is already pre-loaded — use it directly for all file I/O - - Do NOT use `getattr()` — use dict access: `obj["key"]` or `obj.get("key")` - """).rstrip() - ## Routine cache def _cache_routines_from_response(self, response: dict[str, Any] | list[Any]) -> None: @@ -393,7 +349,7 @@ def _get_context_prompt_section(self) -> str: if len(section) > self._CONTEXT_PROMPT_MAX_CHARS: section = section[:self._CONTEXT_PROMPT_MAX_CHARS] + ( - "\n\n... (context truncated — use `read_workspace_file` to read " + "\n\n... (context truncated — use `read_file(scope=\"workspace\", path=\"...\")` to read " "the full context files in `context/` for more detail)" ) @@ -406,7 +362,19 @@ def _extract_routines_from_raw(self) -> list[UsedRoutine]: and status from a previous execution. Returns deduplicated list of successfully executed routines. """ - raw_results = self._workspace.load_raw_json() + raw_results: list[dict[str, Any]] = [] + raw_refs = sorted( + (ref for ref in self._workspace.list_artifacts("raw") if ref.relative_path.endswith(".json")), + key=lambda ref: ref.index, + ) + for ref in raw_refs: + try: + file_data = self._workspace.read_file(ref.relative_path) + content = file_data.get("content") + if isinstance(content, str): + raw_results.append(json.loads(content)) + except Exception as e: + logger.warning("Failed to parse raw JSON artifact %s: %s", ref.relative_path, e) seen: set[str] = set() routines: list[UsedRoutine] = [] for rr in raw_results: @@ -426,8 +394,7 @@ def _extract_routines_from_raw(self) -> list[UsedRoutine]: ## Tool handlers - @agent_tool() - @token_optimized + @agent_tool(token_optimized=True) def _search_routines(self, task: str) -> dict[str, Any]: """ Search for routines by keywords. Matches against routine name and description. @@ -485,11 +452,17 @@ def save_result(result: dict[str, Any]) -> dict[str, Any]: try: idx = next(self._routine_execution_counter) ts = datetime.now().strftime("%y-%m-%d-%H%M%S") - save_info = self._workspace.save_file( - "raw", f"{ts}-routine_result_{idx}.json", + ref = self._workspace.save_artifact( + "raw", + f"{ts}-routine_result_{idx}.json", json.dumps(result, indent=2, default=str), ) - result.update(save_info) + result.update( + { + "output_file": str(self._workspace.root_path / ref.relative_path), + "artifact_id": ref.artifact_id, + }, + ) except Exception as e: logger.exception("Failed to save routine result to file: %s", e) result["output_file_error"] = str(e) @@ -523,8 +496,7 @@ def _summarize_result(full_result: dict[str, Any], req: RoutineExecutionRequest) summary["_hint"] = ( f"Response truncated ({len(raw)} chars). " f"Full result saved to {full_result.get('output_file')}. " - "Use read_workspace_file to inspect the full data, or access it " - "via routine_results in run_python_code." + "Use read_file(scope='workspace', path='...') to inspect the full data, or execute_python to parse it." 
) else: summary["response_preview"] = raw @@ -631,15 +603,22 @@ def _execute_browser_task( logger.error("Browser agent API call failed: %s", e) return {"error": f"Browser agent request failed: {e}"} - # Save final_result as a markdown file in outputs/ + # Save final_result as a markdown file in output/ final_result = result.get("final_result") if final_result: try: ts = datetime.now().strftime("%y-%m-%d-%H%M%S") - save_info = self._workspace.save_file( - "outputs", f"{ts}-browser_agent.md", final_result, + ref = self._workspace.save_artifact( + "output", + f"{ts}-browser_agent.md", + final_result, + ) + result.update( + { + "output_file": str(self._workspace.root_path / ref.relative_path), + "artifact_id": ref.artifact_id, + }, ) - result.update(save_info) except Exception as e: logger.exception("Failed to save browser agent result: %s", e) result["output_file_error"] = str(e) @@ -709,118 +688,6 @@ def _consume_sse_stream(self, response: requests.Response) -> dict[str, Any]: return result - @agent_tool() - def _run_python_code(self, code: str) -> dict[str, Any]: - """ - Execute Python code to post-process routine results and generate output files. - - The code runs with full read/write access to the workspace directory. - Pre-loaded variables: `routine_results` (list of dicts from all JSON files - in the raw/ directory), `json`, `csv`, and `Path` (pathlib.Path). - - Write output files to the outputs/ subdirectory: - with open("outputs/results.csv", "w") as f: ... - - IMPORTANT: Always include print() statements for debugging — print data shapes, - key names, row counts, sample values, etc. If the code fails, use the output - to diagnose and fix. Keep iterating until the output file is correct. - - Args: - code: Python code to execute. Has full file access to the workspace. - Pre-loaded: routine_results (list[dict]), json, csv, Path. - Write output files to outputs/ subdirectory. Always add print() - statements for debugging. - """ - # Ensure directories exist - self._workspace.ensure_dirs() - work_dir = str(self._workspace.root_path.resolve()) - - # Snapshot files in outputs/ before execution - files_before = self._workspace.snapshot_outputs() - - # Load all JSON files from raw/ as routine_results - routine_results = self._workspace.load_raw_json() - - # Execute in sandbox with work_dir for file access - sandbox_result = execute_python_sandboxed( - code, - extra_globals={"routine_results": routine_results}, - work_dir=work_dir, - ) - - # Diff files in outputs/ to find new/modified ones - files_created = self._workspace.diff_outputs(files_before) - - # Build response - result: dict[str, Any] = {} - - if "error" in sandbox_result: - result["error"] = sandbox_result["error"] - workaround = get_workaround_for_error(sandbox_result["error"]) - if workaround: - result["_hint"] = ( - f"Sandbox restriction: {workaround} " - "Fix the code and call run_python_code again." - ) - else: - result["_hint"] = ( - "Code failed. Read the error and stdout above carefully. " - "Use list_workspace_files and read_workspace_file to inspect the data, " - "then fix the code and call run_python_code again." - ) - - output = sandbox_result.get("output", "") - if output and output != "(no output)": - result["output"] = output - - if files_created: - result["files_created"] = files_created - result["output_file"] = files_created[0] - result["_hint"] = ( - "Files were created. Use read_workspace_file to verify the output " - "is correct (check first few lines). If not, fix the code and rerun." 
- ) - elif "error" not in sandbox_result: - result["output"] = result.get("output", "") or "Code ran but produced no files." - result["_hint"] = ( - "No files were created in outputs/. Make sure your code writes to " - "outputs/ (e.g. open('outputs/results.csv', 'w')). Fix and rerun." - ) - - return result - - @agent_tool() - @token_optimized - def _list_workspace_files(self) -> dict[str, Any]: - """ - List all files in the workspace directory as a tree. - - Shows the full directory structure including raw/ (routine results) - and outputs/ (generated files). - """ - return self._workspace.list_files() - - @agent_tool() - def _read_workspace_file( - self, - path: str, - start_line: int | None = None, - end_line: int | None = None, - ) -> dict[str, Any]: - """ - Read a file from the workspace by relative path. - - Use this to inspect raw routine results, verify generated output files, - or debug data issues. Supports optional line ranges for large files. - - Args: - path: Relative path within the workspace (e.g. "raw/routine_results_2024.json" - or "outputs/results.csv"). - start_line: Optional 1-based start line number. Omit to read from the beginning. - end_line: Optional 1-based end line number (inclusive). Omit to read to the end. - """ - return self._workspace.read_file(path, start_line=start_line, end_line=end_line) - ## Context generation (structured output, called by TUI slash command) def generate_context(self, focus: str | None = None) -> BlueBoxAgentContext: @@ -847,7 +714,7 @@ def generate_context(self, focus: str | None = None) -> BlueBoxAgentContext: "CRITICAL: routines_used must include every routine that was executed with exact " "routine_id, routine_name, and parameter values.\n" "Include the final working python_code snippet if post-processing was done.\n" - "Include output_files with relative paths of files written to outputs/.\n" + "Include output_files with relative paths of files written to output/.\n" ) if raw_routines: system_prompt += "\nRoutines found in execution results:\n" @@ -879,14 +746,22 @@ def generate_context(self, focus: str | None = None) -> BlueBoxAgentContext: ) # Save canonical JSON - json_save = self._workspace.save_file( - "context", "agent_context.json", context.model_dump_json(indent=2), + json_ref = self._workspace.save_artifact( + "context", + "agent_context.json", + context.model_dump_json(indent=2), ) # Save companion Markdown - md_save = self._workspace.save_file( - "context", "agent_context.md", context.to_markdown(), + md_ref = self._workspace.save_artifact( + "context", + "agent_context.md", + context.to_markdown(), ) - logger.info("Context files saved: %s, %s", json_save["output_file"], md_save["output_file"]) + logger.info( + "Context files saved: %s, %s", + self._workspace.root_path / json_ref.relative_path, + self._workspace.root_path / md_ref.relative_path, + ) return context diff --git a/bluebox/agents/principal_investigator.py b/bluebox/agents/principal_investigator.py new file mode 100644 index 00000000..ed9a0312 --- /dev/null +++ b/bluebox/agents/principal_investigator.py @@ -0,0 +1,2751 @@ +""" +bluebox/agents/principal_investigator.py + +PrincipalInvestigator (PI) agent — the orchestrator for Phase 2: Experiment-Driven +Routine Construction. + +The PI has NO browser and NO domain tools. 
It only: +- Reads exploration summaries (in its system prompt) +- Reads the Discovery Ledger (routines planned, experiments, proven artifacts) +- Plans what routines to build from the exploration data +- Creates experiment tasks with specific hypotheses +- Records findings and proven artifacts +- Assembles routines and submits them for inspection +- Ships a catalog of routines when done +""" + +from __future__ import annotations + +import hashlib +import json +import re +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed +from datetime import datetime +from textwrap import dedent +from typing import Any, Callable, TYPE_CHECKING + +from pydantic import BaseModel, ValidationError +from toon import encode as toon_encode + +from bluebox.agents.abstract_agent import ( + AbstractAgent, + AgentCard, + AutonomousRunConfig, + ToolResultPersistMode, + agent_tool, +) +from bluebox.agents.routine_inspector import RoutineInspector +from bluebox.data_models.orchestration.inspection import RoutineInspectionResult +from bluebox.workspace import AgentWorkspace +from bluebox.agents.workers.experiment_worker import ExperimentWorker +from bluebox.data_models.llms.interaction import ( + Chat, + ChatRole, + ChatThread, + EmittedMessage, +) +from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel +from bluebox.data_models.orchestration.experiment import ( + ArtifactType, + ExperimentEntry, + ExperimentStatus, + ExperimentTakeaway, + ExperimentVerdict, +) +from bluebox.data_models.orchestration.ledger import ( + DiscoveryLedger, + RoutineAttempt, + RoutineAttemptStatus, + RoutineCatalog, + RoutineSpec, + RoutineSpecStatus, + ShippedRoutine, +) +from bluebox.data_models.orchestration.task import ( + SubAgent, + Task, + TaskStatus, + SpecialistAgentType, +) +from bluebox.data_models.orchestration.state import AgentOrchestrationState +from bluebox.data_models.routine.execution import RoutineExecutionResultWithMetadata +from bluebox.data_models.routine.routine import Routine +from bluebox.llms.data_loaders.dom_data_loader import DOMDataLoader +from bluebox.llms.data_loaders.network_data_loader import NetworkDataLoader +from bluebox.llms.data_loaders.storage_data_loader import StorageDataLoader +from bluebox.llms.data_loaders.window_property_data_loader import WindowPropertyDataLoader +from bluebox.utils.logger import get_logger + +if TYPE_CHECKING: + from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader + from websocket import WebSocket + +logger = get_logger(name=__name__) + + +# --------------------------------------------------------------------------- +# Worker tool descriptions — injected into PI system prompt so the PI knows +# what workers can do and references tools by name in experiment methodologies. +# --------------------------------------------------------------------------- + +WORKER_CAPABILITIES = dedent("""\ + ## Worker Capabilities + + Workers have access to the following tools. When writing experiment methodologies, + reference these tools by name so the worker knows exactly what to use. + + BROWSER TOOLS (act in the live browser): + browser_navigate(url) — go to a URL and wait for page load. + TIP: Navigating directly to an API URL (e.g. https://api.example.com/data) + bypasses CORS restrictions since it's a top-level navigation, not a fetch. + The worker can then read the page body to get the JSON response. + browser_eval_js(expression) — run JavaScript in the page context. 
+        Use for fetch() calls, DOM reads, clicks, storage access.
+        If fetch() fails with CORS, try: mode 'no-cors', or navigate to the URL first.
+      browser_cdp_command(method, params) — raw Chrome DevTools Protocol command.
+        POWERFUL: Can intercept/modify network requests below the browser security layer.
+        Key CDP methods for bypassing CORS/auth issues:
+        - Fetch.enable + Fetch.continueRequest: intercept and modify requests
+        - Network.enable + Network.getResponseBody: capture responses at protocol level
+        - Network.setExtraHTTPHeaders: add headers to all requests
+        - Page.navigate: navigate and capture response at CDP level
+        Workers should use this when browser_eval_js fetch() fails due to CORS.
+      browser_get_dom(selector?, max_depth?, include_tags?) — filtered view of current DOM
+
+    RECORDED LOOKUP TOOLS (search RECORDED session data — old, potentially stale):
+      search_recorded_transactions(query) — find requests in the recorded capture
+      get_recorded_transaction(request_id) — get full recorded request/response details
+        USE THIS FIRST when an API call fails — it shows the exact headers, cookies,
+        and parameters that worked during the original recorded session.
+      search_recorded_storage(query) — find recorded storage events
+      trace_recorded_value(value) — find where a value appears across the recorded capture
+      get_recorded_dom_snapshot(snapshot_index?) — get recorded DOM structure
+      get_recorded_dom_elements(element_type, snapshot_index?) — get recorded element details
+
+    Workers also receive the exploration summaries as shared context.
+""")
+
+
+class PrincipalInvestigator(AbstractAgent):
+    """
+    Orchestrator agent for experiment-driven routine construction.
+
+    The PI reads exploration summaries, plans a catalog of routines,
+    dispatches experiments to ExperimentWorker agents, reviews results,
+    and assembles proven artifacts into shipped routines.
+
+    The PI is self-organizing — no strict phases. It decides what to work on,
+    when to switch routines, and when to call it done.
+    """
+
+    # Maximum time (seconds) a single worker or inspector can run before being killed.
+    # Covers both LLM call hangs and browser/CDP hangs.
+    WORKER_TIMEOUT_SECONDS: int = 180  # 3 minutes
+    # Minimum experiments for a specific routine before allowing mark_routine_failed.
+    MIN_EXPERIMENTS_BEFORE_ROUTINE_FAILURE: int = 2
+    # If execution payload exceeds this size, persist it to inspector workspace raw/
+    # and have the inspector analyze from file instead of inline prompt JSON.
+    INSPECTOR_INLINE_EXECUTION_MAX_CHARS: int = 20_000
+    # Default worker experiment output schema when PI omits one.
+    # This enforces finalize_with_output(output={...}) instead of finalize_result.
+    # TODO: test if we can safely remove this without breaking the worker agent
+    DEFAULT_WORKER_OUTPUT_SCHEMA: dict[str, Any] = {
+        "type": "object",
+        "description": "Structured experiment findings object.",
+        "additionalProperties": True,
+    }
+    # Error patterns → common-issues doc paths.
+    # When an experiment result contains these keywords, the matching doc is
+    # auto-injected into the get_experiment_result response so the PI sees
+    # remediation guidance without having to search for it.
+    _ERROR_DOC_PATTERNS: list[tuple[list[str], str]] = [
+        (["failed to fetch", "typeerror: failed", "cors"], "common-issues/cors-failed-to-fetch.md"),
+        (["401", "403", "unauthorized", "forbidden"], "common-issues/unauthenticated.md"),
+    ]

+    **Source 2: HTML / DOM (meta tags, inline config)**
+      Sites embed site-level keys and config directly in the page markup.
+      - Meta tags: e.g. a `csrf-token` meta element in the page head
+      - Inline config: `window.__CONFIG__ = { apiKey: "..." }`
+      - Data attributes: keys exposed via `data-*` attributes on page elements
+      - Experiment methodology: "Navigate to {url}, then run JS to check:
+        document.querySelector('meta[name=csrf-token]'), window.__CONFIG__,
+        window.__INITIAL_STATE__, window.ENV. We saw a key like '{observed_value}'
+        in the captured session — is it still the same or has it changed?"
+      - Routine uses: `{{meta:csrf-token}}` or `{{windowProperty:__CONFIG__.apiKey}}`
+
+    **Source 3: Browser storage (localStorage / sessionStorage)**
+      Sites store tokens after their JS authenticates on page load.
+      - Experiment methodology: "Navigate to {url}, wait 3 seconds for JS to execute,
+        then dump sessionStorage and localStorage. Look for keys containing
+        'token', 'auth', 'jwt', 'session'. In the capture we saw '{key_name}'
+        with value starting '{prefix}...'"
+      - Routine uses: `{{localStorage:auth.access_token}}` or
+        `{{sessionStorage:token.jwt}}`
+
+    **Source 4: Cookies**
+      Some sites use cookie-based auth — navigation establishes the session.
+      - Experiment methodology: "Navigate to {url}, then try calling {data_endpoint}
+        with credentials:'include'. If it works, auth is cookie-based and the
+        routine just needs navigate + fetch with credentials:'include'. If it
+        fails, dump cookies with get_cookies to see what exists."
+      - Routine uses: `credentials: "include"` or `{{cookie:XSRF-TOKEN}}`
+
+    **Source 5: Window properties (JS globals)**
+      Sites set global variables with config and auth.
+      - Experiment methodology: "Navigate to {url}, run JS to check window.__CONFIG__,
+        window.__INITIAL_STATE__, window.ENV, window.__NEXT_DATA__"
+      - Routine uses: `{{windowProperty:__CONFIG__.apiKey}}`
+
+    **Source 6: JS evaluation (compute from page state)**
+      When tokens are derived/computed by the site's JS and stored in non-obvious places.
+      - Experiment methodology: "Navigate to {url}, wait for page load. The site's JS
+        likely stores auth state somewhere. Try: JSON.parse(sessionStorage.getItem(
+        'persist:root')).auth, or look through all sessionStorage keys for anything
+        containing 'token'. Extract the value and try using it."
+      - Routine uses: js_evaluate operation to extract + store in sessionStorage
+
+    **CRITICAL: When dispatching auth experiments, ALWAYS include:**
+    1. The observed token/key value (or first 20 chars) from the captured session
+    2. Where you found it in captures (which header, which response field)
+    3. Whether it appears static (same across captures) or dynamic (different each time)
+    4. Multiple strategies to try — "First try X, if that fails try Y, then Z"
+
+    ## Hardcoding Site-Level Credentials — CRITICAL
+
+    Many sites use API keys, subscription keys, or client IDs that are NOT user
+    secrets — they are site-wide constants baked into the website's JavaScript,
+    HTML meta tags, or network requests. Examples:
+    - Ocp-Apim-Subscription-Key
+    - x-api-key / apiKey / client_id
+    - Firebase API keys
+    - Public OAuth client IDs
+
+    These MUST be resolved from captures (network headers, DOM, storage) and
+    HARDCODED directly into the routine. They must NEVER be exposed as user
+    parameters — no user would know where to find them.
+
+    **Resolution order for static keys:**
+    1. Network captures: check request headers from get_recorded_transaction
+    2. DOM: check inline scripts, meta tags, window.* config objects
+    3. Storage: check localStorage/sessionStorage for cached keys
+    4. If found in captures, hardcode the value directly in routine headers/body
+
+    **JWT/Bearer tokens are DIFFERENT** — they expire and must be fetched at
+    runtime via a fetch operation within the routine.
But the API key USED TO + fetch the token should itself be hardcoded. + + **When building routines:** only parameterize values that a USER would + naturally provide (search terms, dates, IDs, locations). Everything else + should be hardcoded from captures. + + ## Parallel Experiments — ALWAYS PREFER BATCH (within dependency order) + + ALWAYS use dispatch_experiments_batch instead of dispatch_experiment when you + have 2+ INDEPENDENT experiments to run. This runs them IN PARALLEL on separate + workers — N experiments complete in the time of 1. + + But NEVER batch experiments that have unresolved dependencies on each other. + Auth must be solved before data endpoints. Reference data (e.g. station lists) + should be solved before parameterized endpoints that depend on those IDs. + + dispatch_experiment (singular) should ONLY be used for one-off experiments. + For all new experiments, batch them with dispatch_experiments_batch. + + Batch aggressively: + - After plan_routines, immediately batch experiments for all priority-1 routines + - When testing multiple API endpoints, batch them all at once + - When probing auth + multiple data endpoints, batch everything together + + ## Routine Naming & Documentation Standards + + These routines will be VECTORIZED and stored in databases for other agents to + discover via semantic search. Poor names and vague descriptions make routines + invisible and unusable. Follow these rules strictly: + + **Routine name** — snake_case, verb_noun pattern, 3+ segments, MUST include site context: + The name must make sense in isolation — another agent reading ONLY the name + should know what site/service this targets and what it does. Include a short + site identifier as a prefix or qualifier. + + GOOD: get_premierleague_standings, search_premierleague_matches_by_season, + fetch_amtrak_train_schedules, download_arxiv_paper_pdf, + list_espn_upcoming_fixtures, get_github_repo_stars + BAD: get_standings (standings from where?), get_content_item (what content? + what site?), fetch_data (completely generic), search_matches (which sport? + which site?), get_league_standings (which league? which site?) + + **Routine description** — ≥8 words, must explain: + 1. What it does (the action) + 2. What inputs it accepts (parameters) + 3. What data it returns (the output) + GOOD: "Fetches Premier League standings for a given competition ID and + season ID, returning team names, positions, wins, draws, losses, + goals scored, goals conceded, and total points." + BAD: "Get standings" (too short, no input/output info) + BAD: "A routine for the Premier League" (doesn't say what it does or returns) + + **Parameter names** — snake_case, descriptive: + GOOD: competition_id, season_year, team_name, departure_date + BAD: id (ambiguous), param1 (meaningless), x (cryptic) + + **Parameter descriptions** — ≥3 words, explain what the value represents: + GOOD: "The unique competition identifier (e.g. 1 for Premier League)" + GOOD: "Season year in YYYY format (e.g. 2024)" + BAD: "ID" (too terse) + BAD: "The season" (doesn't explain format or expected values) + + **Non-obvious parameter sourcing** — CRITICAL for opaque IDs and codes: + If a parameter is NOT something a human would naturally know (e.g. an internal + numeric ID, a slug, an encoded token, a UUID), the description MUST explain + WHERE to get that value. The user calling this routine has no idea what + "competition_id: 1" means unless you tell them how to find it. + + GOOD: "Internal competition ID. 
Obtain from the get_competitions routine or + the /competitions API endpoint. Example: 1 = Premier League, 2 = Championship." + GOOD: "Season ID as used by the Premier League API. Use the get_seasons routine + to list valid season IDs for a competition. Example: 418 = 2023-24 season." + GOOD: "Team slug as it appears in the site URL path (e.g. 'arsenal', 'manchester-united'). + Find by calling get_teams or navigating to the team page." + BAD: "The competition ID" (where do I get it?) + BAD: "Season identifier" (what values are valid? how do I look them up?) + + Rule of thumb: if you can't google the value, the description must say how to get it. + + ## CRITICAL RULES + + - NEVER guess at request details. Always dispatch experiments to verify. + - Write experiment methodologies that reference worker tools by name. + - Record a verdict for EVERY completed experiment via record_finding. + - Always include reusable takeaways in record_finding so future workers + receive concrete lessons (claim + how_to_apply_next + evidence). + - If an experiment is ambiguous, dispatch a targeted follow-up experiment with more specific methodology. + - ALWAYS provide test_parameters when calling submit_routine — the routine + WILL be executed and inspected. Use realistic values the experiments proved work. + If the routine has 0 parameters, pass test_parameters: {} + - DEPENDENCY ORDER IS SACRED: auth → reference data → data endpoints → assembly. + NEVER dispatch data endpoint experiments until auth is CONFIRMED working. + NEVER give up on data endpoints just because they returned 401 — that means + you need to solve auth first, not that the endpoint is broken. + - Workers do NOT share browser state. When an endpoint requires auth, your + experiment methodology must include FULL auth instructions (token URL, headers, + subscription key) so the worker can authenticate within its own session. + - mark_routine_failed is globally gated: you cannot fail any routine until at + least 5 routine attempts have failed across the pipeline. Keep iterating and + submitting improved routines before giving up on individual specs. + + ## Resilience — NEVER Give Up Early + + - NEVER call mark_failed after fewer than 5 experiments per routine. + CORS failures, 400 errors, and network issues are NORMAL obstacles, not + reasons to quit. They mean you need a different approach, not that the + pipeline is hopeless. + - When a fetch fails (CORS, 400, timeout), iterate with alternative approaches: + 1. Use search_recorded_transactions / get_recorded_transaction to see the EXACT + request headers and patterns that worked in the recorded session, then + replicate them in the worker's experiment. + 2. Use browser_cdp_command with Fetch.enable to intercept requests at the CDP + level — this bypasses CORS entirely since it operates below the browser + security layer. + 3. Try navigating directly to the API URL with browser_navigate — GET requests + via top-level navigation don't have CORS restrictions. + 4. Try fetch with mode: 'no-cors' or from a different origin context. + 5. Check if the site's JS uses a proxy path (e.g. /api/* proxied to the API + domain) — search the captured network data for path patterns. + - If ALL alternative approaches fail for a routine, mark_routine_failed for THAT + routine and move on to the next one. Do NOT call mark_failed (pipeline-level) + unless every single routine has been individually addressed. 
+ - When results are unclear, dispatch a focused follow-up experiment rather than + guessing — experiments are cheap compared to shipping a broken routine. + + ## Common Execution Failures — MUST READ + + ### TypeError: Failed to fetch (CORS) + If a routine's fetch operation fails with "TypeError: Failed to fetch", this + almost always means the browser's current origin doesn't match the API's + CORS Access-Control-Allow-Origin header. Routines start from about:blank + (origin = null), so ANY cross-origin fetch will fail without navigation. + + The fix is to add a `navigate` operation BEFORE the first fetch to set the + browser origin to the allowed domain. + + Example: If the API is at https://api.example.com but CORS only allows + https://www.example.com, the routine MUST start with: + {"type": "navigate", "url": "https://www.example.com"} + before any fetch to https://api.example.com/... + + RULE: Every routine that calls an external API MUST start with a navigate + operation. This is cheap (one page load) and prevents CORS issues. If you + see "Failed to fetch" in an inspection blocking issue, ADD A NAVIGATE OP. + + For more details: search_files(scope="docs", query="cors-failed-to-fetch", mode="exact") + + ### HTTP 401/403 (Authentication) + If a fetch returns 401/403, the routine is missing authentication. Check + experiment findings for auth token endpoints and subscription keys. The + routine must obtain a token (via fetch + js_evaluate) before calling + protected endpoints. For more details: search_files(scope="docs", query="unauthenticated", mode="exact") + """) + + # ----------------------------------------------------------------------- + # Constructor + # ----------------------------------------------------------------------- + + def __init__( + self, + emit_message_callable: Callable[[EmittedMessage], None], + task: str, + # Exploration summaries — injected into system prompt + exploration_summaries: dict[str, str] | None = None, + # Data loaders — passed through to workers + network_data_loader: NetworkDataLoader | None = None, + storage_data_loader: StorageDataLoader | None = None, + dom_data_loader: DOMDataLoader | None = None, + window_property_data_loader: WindowPropertyDataLoader | None = None, + documentation_data_loader: DocumentationDataLoader | None = None, + # Browser context — passed through to workers + remote_debugging_address: str | None = None, + # Resume support — pass an existing ledger to pick up where a previous PI left off + ledger: DiscoveryLedger | None = None, + # LLM config + llm_model: LLMModel = OpenAIModel.GPT_5_1, + worker_llm_model: LLMModel | None = None, + max_iterations: int = 200, + worker_max_loops: int = 30, + max_attempts_per_routine: int = 5, + min_experiments_before_fail: int = 10, + min_global_failed_attempts_before_routine_failure: int = 5, + # Agent pool sizes + num_workers: int = 3, + num_inspectors: int = 1, + # Persistence callbacks + on_ledger_change: Callable[[DiscoveryLedger, str], None] | None = None, + on_agent_thread: Callable[[str, str, list[dict[str, Any]]], None] | None = None, + on_attempt_record: Callable[[dict[str, Any]], None] | None = None, + # Standard agent args + persist_chat_callable: Callable[[Chat], Chat] | None = None, + persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, + stream_chunk_callable: Callable[[str], None] | None = None, + chat_thread: ChatThread | None = None, + existing_chats: list[Chat] | None = None, + workspace: AgentWorkspace | None = None, + 
worker_workspace_factory: Callable[[], AgentWorkspace] | None = None, + inspector_workspace_factory: Callable[[], AgentWorkspace] | None = None, + ) -> None: + # Task + self._task = task + self._max_iterations = max_iterations + self._worker_max_loops = worker_max_loops + self._max_attempts_per_routine = max_attempts_per_routine + self._min_experiments_before_fail = min_experiments_before_fail + self._min_global_failed_attempts_before_routine_failure = max( + 0, + int(min_global_failed_attempts_before_routine_failure), + ) + + # Exploration context + raw_summaries = exploration_summaries or {} + self._exploration_summaries_raw = dict(raw_summaries) + toonified_summaries: dict[str, str] = {} + for domain, summary in raw_summaries.items(): + if not isinstance(summary, str): + toonified_summaries[domain] = toon_encode(summary) + continue + + stripped = summary.strip() + if not stripped: + toonified_summaries[domain] = summary + continue + + try: + parsed = json.loads(stripped) + except json.JSONDecodeError: + toonified_summaries[domain] = summary + continue + + toonified_summaries[domain] = toon_encode(parsed) + self._exploration_summaries = toonified_summaries + + # Data loaders (passed through to workers) + self._network_data_loader = network_data_loader + self._storage_data_loader = storage_data_loader + self._dom_data_loader = dom_data_loader + self._window_property_data_loader = window_property_data_loader + self._documentation_data_loader = documentation_data_loader + + # Browser context (passed through to workers) + self._remote_debugging_address = remote_debugging_address + + # LLM + self._worker_llm_model = worker_llm_model or llm_model + + # Agent pools + self._num_workers = num_workers + self._num_inspectors = num_inspectors + self._worker_counter = 0 # Round-robin counter for workers + self._inspector_counter = 0 # Round-robin counter for inspectors + + # Persistence callbacks + self._on_ledger_change = on_ledger_change + self._on_agent_thread = on_agent_thread + self._on_attempt_record = on_attempt_record + self._worker_workspace_factory = worker_workspace_factory + self._inspector_workspace_factory = inspector_workspace_factory + + # Internal state — the Discovery Ledger tracks everything + # Accept an existing ledger for resume after context exhaustion + self._ledger = ledger or DiscoveryLedger(user_task=task) + self._orchestration_state = AgentOrchestrationState() + self._agent_instances: dict[str, AbstractAgent] = {} + self._is_done = False + self._pipeline_result: RoutineCatalog | None = None + self._recent_tool_calls: list[str] = [] # Track recent tool names for loop detection + self._docs_reviewed: bool = False # Gate: must review docs before dispatching experiments + + super().__init__( + emit_message_callable=emit_message_callable, + workspace=workspace, + persist_chat_callable=persist_chat_callable, + persist_chat_thread_callable=persist_chat_thread_callable, + stream_chunk_callable=stream_chunk_callable, + llm_model=llm_model, + chat_thread=chat_thread, + existing_chats=existing_chats, + documentation_data_loader=documentation_data_loader, + allow_code_execution=True, + ) + + logger.debug( + "PrincipalInvestigator initialized: task=%s, explorations=%s", + task[:80], + list(self._exploration_summaries.keys()), + ) + + # ----------------------------------------------------------------------- + # System prompt + # ----------------------------------------------------------------------- + + def _get_system_prompt(self) -> str: + parts: list[str] = [self.SYSTEM_PROMPT_CORE] 
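+ # Layered assembly: core prompt first, then workspace summary, routine
+ # JSON schema + worked example, worker capabilities, exploration
+ # summaries, a trimmed Discovery Ledger dump, and task queue status.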
+ if self.has_workspace:
+ try:
+ summary = self._require_workspace().generate_summary()
+ parts.append(f"\n\n## Workspace Summary\n{summary}")
+ except Exception as e:
+ logger.warning("Failed to generate PI workspace summary: %s", e)
+
+ # Routine JSON schema — auto-generated from the Pydantic models
+ parts.append("\n## Routine JSON Schema\n\n")
+ parts.append("When calling submit_routine, the routine_json MUST conform to this schema.\n")
+ parts.append("Every operation needs a 'type' field as discriminator.\n\n")
+ parts.append(Routine.model_schema_markdown())
+ parts.append("\n\n### Example routine_json (fetch a parameterized API)\n\n")
+ parts.append(dedent("""\
+ ```json
+ {
+ "name": "get_premierleague_standings",
+ "description": "Fetches Premier League standings for a given competition and season, returning team names, positions, wins, draws, losses, goals scored, goals conceded, and total points.",
+ "parameters": [
+ {"name": "competition_id", "type": "integer", "description": "Internal competition identifier. Obtain from the get_premierleague_competitions routine. Example: 1 = Premier League."},
+ {"name": "season_id", "type": "integer", "description": "Season identifier as used by the Premier League API. Obtain from the get_premierleague_seasons routine. Example: 418 = 2023-24 season."}
+ ],
+ "operations": [
+ {
+ "type": "navigate",
+ "url": "https://www.example.com"
+ },
+ {
+ "type": "fetch",
+ "endpoint": {
+ "url": "https://api.example.com/competitions/{{competition_id}}/seasons/{{season_id}}/standings",
+ "method": "GET"
+ },
+ "session_storage_key": "standings_result"
+ },
+ {
+ "type": "return",
+ "session_storage_key": "standings_result"
+ }
+ ]
+ }
+ ```
+ """))
+
+ # Worker capabilities
+ parts.append(WORKER_CAPABILITIES)
+ parts.append(self._generate_code_execution_prompt())
+
+ # Exploration summaries
+ if self._exploration_summaries:
+ parts.append("\n## Exploration Summaries\n")
+ for domain, summary in self._exploration_summaries.items():
+ parts.append(f"### {domain}\n{summary}\n")
+
+ # Discovery Ledger
+ if self._ledger.routine_specs or self._ledger.experiments or self._ledger.attempts:
+ ledger_payload = self._ledger.model_dump(
+ mode="json",
+ exclude={
+ "experiments": {
+ "__all__": {
+ "prompt": True,
+ "output": True,
+ }
+ },
+ "attempts": {
+ "__all__": {
+ "routine_json": True,
+ "execution_result": True,
+ "inspection_result": True,
+ }
+ },
+ },
+ )
+ parts.append("\n## Discovery Ledger\n")
+ parts.append(toon_encode(ledger_payload))
+
+ # Task queue status
+ queue = self._orchestration_state.get_queue_status()
+ if any(v > 0 for v in queue.values()):
+ parts.append("\n## Task Queue\n")
+ parts.append(toon_encode(queue))
+
+ return "".join(parts)
+
+ # -----------------------------------------------------------------------
+ # Public entry point
+ # -----------------------------------------------------------------------
+
+ def run(self) -> RoutineCatalog | None:
+ """
+ Run the PI loop to completion.
+
+ Returns:
+ A RoutineCatalog of shipped routines, or None if construction failed.
+ """
+ # Seed the conversation — detect resume vs fresh start
+ is_resume = bool(self._ledger.experiments or self._ledger.routine_specs)
+
+ if is_resume:
+ initial_message = (
+ f"TASK: {self._task}\n\n"
+ "You are RESUMING a previous session that ran out of context. 
" + "Your Discovery Ledger has been preserved with all prior work.\n\n" + "FIRST: Call get_ledger to see exactly where things stand — " + "what routines are planned, what experiments have been run, " + "what's shipped, and what still needs work.\n\n" + "Then pick up where the previous session left off. Do NOT repeat " + "experiments that already have verdicts." + ) + logger.info( + "PI resuming: %d specs, %d experiments, %d attempts", + len(self._ledger.routine_specs), + len(self._ledger.experiments), + len(self._ledger.attempts), + ) + else: + initial_message = ( + f"TASK: {self._task}\n\n" + "MANDATORY FIRST STEP: Review the routine documentation before doing anything else.\n" + "Call search_files(scope='docs', query='operation', mode='exact') and read_file(scope='docs', path='...') on the Routine/operation model files\n" + "to understand ALL operation types (fetch, click, input_text, js_evaluate, download, etc.).\n" + "You CANNOT dispatch experiments until you understand the full routine capabilities.\n\n" + "After reviewing docs:\n" + "1. Analyze the exploration summaries\n" + "2. Call plan_routines to declare what routines to build\n" + "3. Use dispatch_experiments_batch to test ALL priority-1 routines IN PARALLEL\n" + "4. Record findings for each, then batch the next round\n" + "5. Build and submit_routine for each proven routine\n" + "6. Call mark_complete when all routines are shipped or failed\n\n" + "IMPORTANT: Always use dispatch_experiments_batch (not dispatch_experiment) " + "to run multiple experiments in parallel. This is much faster." + ) + self._add_chat(ChatRole.USER, initial_message) + + for iteration in range(self._max_iterations): + if self._is_done: + logger.info("PI completed after %d iterations", iteration) + self._dump_agent_thread("principal_investigator", self) + return self._pipeline_result + + messages = self._build_messages_for_llm() + response = self._call_llm( + messages, + self._get_system_prompt(), + tool_choice="required", + ) + + # Add assistant response to chat + self._add_chat( + ChatRole.ASSISTANT, + content=response.content or "", + tool_calls=response.tool_calls, + llm_provider_response_id=response.response_id, + ) + + # Persist PI thread after every iteration + self._dump_agent_thread("principal_investigator", self) + + if response.tool_calls: + self._process_tool_calls(response.tool_calls) + + # Track docs review — any docs tool call satisfies the gate + _DOCS_TOOLS = {"search_files", "read_file", "list_files"} + for tc in response.tool_calls: + if tc.tool_name in _DOCS_TOOLS: + scope = (tc.tool_arguments or {}).get("scope") + if scope == "docs": + self._docs_reviewed = True + + # Loop detection: track recent tool calls (name + whether it errored) + for tc in response.tool_calls: + self._recent_tool_calls.append(tc.tool_name) + # Keep only last 6 + self._recent_tool_calls = self._recent_tool_calls[-6:] + + # Detect stuck patterns: + # 1. Same tool called 3+ times in a row (any tool) + # 2. Alternating pair (e.g. 
mark_routine_failed + get_ledger) + _is_stuck = False + _stuck_msg = "" + + if len(self._recent_tool_calls) >= 3: + last3 = self._recent_tool_calls[-3:] + # Same tool 3x in a row + if len(set(last3)) == 1: + _is_stuck = True + _stuck_msg = f"calling {last3[0]} repeatedly" + + if not _is_stuck and len(self._recent_tool_calls) >= 4: + last4 = self._recent_tool_calls[-4:] + # Alternating pair: A B A B + if last4[0] == last4[2] and last4[1] == last4[3] and last4[0] != last4[1]: + _is_stuck = True + _stuck_msg = f"alternating between {last4[0]} and {last4[1]}" + + if _is_stuck: + self._recent_tool_calls.clear() + shipped = sum(1 for s in self._ledger.routine_specs if s.status == RoutineSpecStatus.SHIPPED) + failed = sum(1 for s in self._ledger.routine_specs if s.status == RoutineSpecStatus.FAILED) + unaddressed = [ + s for s in self._ledger.routine_specs + if s.status not in (RoutineSpecStatus.SHIPPED, RoutineSpecStatus.FAILED) + ] + unaddressed_names = [f"{s.name} ({s.status.value})" for s in unaddressed] + self._add_chat( + ChatRole.USER, + f"STOP — you are stuck {_stuck_msg}. This is wasting iterations.\n\n" + f"Current progress: {shipped} shipped, {failed} failed, " + f"{len(unaddressed)} unaddressed.\n" + f"Unaddressed routines: {', '.join(unaddressed_names) or 'none'}\n\n" + "If a tool keeps returning an error, do NOT retry the same call. " + "Read the error message and change your approach.\n\n" + "To make progress you MUST do ONE of these:\n" + "1. dispatch_experiments_batch for unaddressed routines\n" + "2. submit_routine with VALID routine_json AND test_parameters " + "(if routine has 0 params, pass test_parameters: {})\n" + "3. mark_routine_failed for routines that truly can't work\n" + "4. mark_complete if all routines are shipped or failed\n\n" + "Pick ONE action for a DIFFERENT routine than the one you're stuck on.", + ) + else: + # Nudge the PI to act + self._add_chat( + ChatRole.USER, + "You must use a tool. 
Dispatch an experiment, record a finding, " + "submit a routine, or mark_complete if done.", + ) + + logger.warning("PI exhausted %d iterations without completing", self._max_iterations) + # Dump PI thread before returning partial results + self._dump_agent_thread("principal_investigator", self) + # Return whatever we've shipped so far + return self._build_partial_catalog() + + # =================================================================== + # Persistence — notify external listener after ledger mutations + # =================================================================== + + def _persist(self, reason: str) -> None: + """Fire the on_ledger_change callback if registered.""" + if self._on_ledger_change is not None: + try: + self._on_ledger_change(self._ledger, reason) + except Exception as e: + logger.warning("on_ledger_change callback failed: %s", e) + + def _record_attempt( + self, + spec: RoutineSpec, + attempt: RoutineAttempt, + routine_json: dict[str, Any], + test_parameters: dict[str, Any], + execution_result: RoutineExecutionResultWithMetadata | None, + inspection_result: dict[str, Any] | None, + ) -> None: + """Persist a unified attempt record via the on_attempt_record callback.""" + if self._on_attempt_record is None: + return + try: + # Count which attempt number this is for the spec + spec_attempts = self._ledger.get_attempts_for_spec(spec.id) + attempt_number = len(spec_attempts) + + record: dict[str, Any] = { + "attempt_id": attempt.id, + "spec_id": spec.id, + "spec_name": spec.name, + "spec_description": spec.description, + "attempt_number": attempt_number, + "timestamp": str(datetime.now()), + "verdict": attempt.status.value, + "routine_json": routine_json, + "test_parameters": test_parameters, + "execution_result": ( + execution_result.model_dump() if execution_result is not None else None + ), + "inspection_result": inspection_result, + } + self._on_attempt_record(record) + except Exception as e: + logger.warning("on_attempt_record callback failed: %s", e) + + def _dump_agent_thread(self, agent_label: str, agent: AbstractAgent) -> None: + """Dump an agent's full message history via the on_agent_thread callback.""" + if self._on_agent_thread is None: + return + try: + chats = agent.get_chats() + messages = [] + for c in chats: + try: + msg = { + "id": c.id, + "role": c.role.value if c.role else "unknown", + "content": c.content or "", + "tool_calls": [ + {"tool_name": tc.tool_name, "arguments": tc.tool_arguments, "call_id": tc.call_id} + for tc in (c.tool_calls or []) + ], + "tool_call_id": c.tool_call_id, + "created_at": str(c.created_at) if c.created_at else None, + } + messages.append(msg) + except Exception as chat_err: + logger.warning("Failed to serialize chat %s for %s: %s", c.id, agent_label, chat_err) + messages.append({"id": c.id, "role": "unknown", "content": f"[serialization error: {chat_err}]"}) + thread_id = agent.get_thread().id if agent.get_thread() else "unknown" + self._on_agent_thread(agent_label, thread_id, messages) + except Exception as e: + logger.warning("on_agent_thread callback failed for %s: %s", agent_label, e, exc_info=True) + + # =================================================================== + # CATALOG PLANNING TOOLS + # =================================================================== + + @agent_tool() + def _plan_routines( + self, + specs: list[dict[str, Any]], + ) -> dict[str, Any]: + """ + Declare what routines to build from the exploration data. + + Call this early after analyzing exploration summaries. 
Each spec
+ represents a distinct capability to extract from the site.
+ Can be called again to add new specs discovered during experimentation.
+
+ Args:
+ specs: List of routine specs. Each dict has:
+ - name: Short name including site context (e.g. "get_premierleague_standings")
+ - description: What the routine does
+ - priority: 1=must-have, 2=should-have, 3=nice-to-have (default 1)
+ """
+ created_ids: list[str] = []
+ for spec_dict in specs:
+ spec = RoutineSpec(
+ name=spec_dict.get("name") or spec_dict.get("id", "unnamed"),
+ description=spec_dict.get("description", ""),
+ priority=spec_dict.get("priority", 1),
+ )
+ self._ledger.add_spec(spec)
+ created_ids.append(spec.id)
+
+ # Auto-set the first one as active if none is active
+ if self._ledger.active_spec_id is None and self._ledger.routine_specs:
+ self._ledger.active_spec_id = self._ledger.routine_specs[0].id
+
+ self._persist("plan_routines")
+ return {
+ "created": len(created_ids),
+ "spec_ids": created_ids,
+ "total_specs": len(self._ledger.routine_specs),
+ }
+
+ @agent_tool()
+ def _set_active_routine(self, spec_id: str) -> dict[str, Any]:
+ """
+ Switch focus to a different routine. The PI works on one routine at a
+ time but can switch when blocked or when dependencies are shared.
+
+ Args:
+ spec_id: ID of the RoutineSpec to focus on.
+ """
+ spec = self._ledger.get_spec(spec_id)
+ if spec is None:
+ return {"error": f"No spec found with ID: {spec_id}"}
+
+ self._ledger.active_spec_id = spec_id
+ return {"active": spec.name, "status": spec.status.value}
+
+ # ===================================================================
+ # EXPERIMENT TOOLS
+ # ===================================================================
+
+ def _truncate_text_for_briefing(self, value: Any, max_chars: int = 220) -> str:
+ """Normalize arbitrary values to a compact, single-line string."""
+ text = value if isinstance(value, str) else json.dumps(value, default=str)
+ text = text.replace("\n", " ").strip()
+ if len(text) > max_chars:
+ return text[:max_chars] + "..."
+ return text
+
+ def _build_worker_briefing(self, routine_spec_id: str | None) -> str:
+ """
+ Build compact reusable context for workers from ledger state.
+
+ Includes proven artifacts, reusable experiment takeaways, and known blockers
+ from the latest failed routine attempt.
+ """
+ lines: list[str] = [
+ "## Worker Briefing (Reusable Context)",
+ "Use this as prior context. 
Verify assumptions against live/browser evidence.", + ] + + # Proven artifacts (compact, capped) + proven_lines: list[str] = [] + for fetch in self._ledger.proven.fetches[-5:]: + method = self._truncate_text_for_briefing(fetch.get("method", "?"), 24) + url = self._truncate_text_for_briefing(fetch.get("url", "?"), 140) + proven_lines.append(f"- FETCH: {method} {url}") + for nav in self._ledger.proven.navigations[-4:]: + url = self._truncate_text_for_briefing(nav.get("url", "?"), 160) + proven_lines.append(f"- NAV: {url}") + for token in self._ledger.proven.tokens[-4:]: + name = self._truncate_text_for_briefing(token.get("name", "?"), 40) + source = self._truncate_text_for_briefing(token.get("source", "?"), 120) + proven_lines.append(f"- TOKEN: {name} (source: {source})") + for param in self._ledger.proven.parameters[-5:]: + name = self._truncate_text_for_briefing(param.get("name", "?"), 40) + ptype = self._truncate_text_for_briefing(param.get("type", "?"), 24) + example = self._truncate_text_for_briefing(param.get("example_value", ""), 90) + proven_lines.append(f"- PARAM: {name} ({ptype}) example={example}") + if proven_lines: + lines.append("\n### Proven Artifacts") + lines.extend(proven_lines[:12]) + + # Reusable takeaways from confirmed/partial experiments + relevant_takeaways: list[tuple[ExperimentEntry, ExperimentTakeaway]] = [] + for exp in reversed(self._ledger.experiments): + if exp.verdict not in {ExperimentVerdict.CONFIRMED, ExperimentVerdict.PARTIAL}: + continue + if exp.routine_spec_id is not None and routine_spec_id is not None: + if exp.routine_spec_id != routine_spec_id: + continue + elif exp.routine_spec_id is not None and routine_spec_id is None: + # Shared experiments should not inherit routine-specific assumptions by default. + continue + + for takeaway in exp.takeaways: + relevant_takeaways.append((exp, takeaway)) + if len(relevant_takeaways) >= 8: + break + if len(relevant_takeaways) >= 8: + break + + if relevant_takeaways: + lines.append("\n### Prior Experiment Takeaways") + for exp, takeaway in relevant_takeaways: + claim = self._truncate_text_for_briefing(takeaway.claim, 190) + tags = ", ".join(takeaway.tags[:4]) if takeaway.tags else "" + prefix = f"[{exp.id}]" + if tags: + prefix += f" [{tags}]" + lines.append(f"- {prefix} {claim}") + if takeaway.how_to_apply_next: + how = self._truncate_text_for_briefing(takeaway.how_to_apply_next, 180) + lines.append(f"- apply: {how}") + if takeaway.evidence: + ev = self._truncate_text_for_briefing(takeaway.evidence, 180) + lines.append(f"- evidence: {ev}") + + # Latest blockers for this routine spec + if routine_spec_id: + attempts = self._ledger.get_attempts_for_spec(routine_spec_id) + latest_failed = next( + (attempt for attempt in reversed(attempts) if attempt.status == RoutineAttemptStatus.FAILED), + None, + ) + if latest_failed and latest_failed.blocking_issues: + lines.append("\n### Known Blockers (Latest Failed Attempt)") + for issue in latest_failed.blocking_issues[:4]: + lines.append(f"- {self._truncate_text_for_briefing(issue, 220)}") + + if len(lines) <= 2: + return "" + return "\n".join(lines) + + def _compose_worker_methodology(self, methodology: str, routine_spec_id: str | None) -> str: + """Compose final worker task methodology with briefing + assigned experiment.""" + briefing = self._build_worker_briefing(routine_spec_id) + if not briefing: + return methodology + return f"{briefing}\n\n## Assigned Experiment Methodology\n{methodology}" + + @agent_tool(token_optimized=True) + def _dispatch_experiment( + self, + 
hypothesis: str, + rationale: str, + methodology: str, + output_description: str | None = None, + ) -> dict[str, Any]: + """ + Create and dispatch an experiment to a worker. + + The worker has browser tools and capture lookup tools. Write the methodology + so the worker knows exactly what to do — reference tools by name. + + Args: + hypothesis: What we're testing. Specific and falsifiable. + rationale: WHY we're testing this — evidence, reasoning, expectations. + methodology: Instructions for the worker. Reference worker tools by name. + output_description: Description of expected output. + """ + # Gate: must review docs first + if not self._docs_reviewed and self._documentation_data_loader is not None: + return { + "error": ( + "You must review the routine documentation BEFORE dispatching experiments. " + "Call search_files(scope='docs', query='operation', mode='exact') and read_file(scope='docs', path='...') to understand all available " + "operation types (fetch, click, input_text, js_evaluate, get_cookies, " + "download, etc.). This ensures you design experiments that leverage the " + "full routine capabilities." + ) + } + + worker_methodology = self._compose_worker_methodology( + methodology=methodology, + routine_spec_id=None, + ) + resolved_output_description = output_description or "Structured experiment findings." + + # Create experiment entry + experiment = ExperimentEntry( + hypothesis=hypothesis, + rationale=rationale, + methodology=worker_methodology, + routine_spec_id=None, + status=ExperimentStatus.RUNNING, + ) + self._ledger.add_experiment(experiment) + + # Create and dispatch task + task = Task( + agent_type=SpecialistAgentType.EXPERIMENT_WORKER, + prompt=worker_methodology, + max_loops=self._worker_max_loops, + output_schema=self.DEFAULT_WORKER_OUTPUT_SCHEMA, + output_description=resolved_output_description, + ) + self._orchestration_state.add_task(task) + experiment.task_id = task.id + + # Execute immediately + result = self._execute_task(task) + + # Update experiment status from task + if task.status == TaskStatus.COMPLETED: + experiment.status = ExperimentStatus.DONE + experiment.output = task.result + elif task.status == TaskStatus.FAILED: + experiment.status = ExperimentStatus.FAILED + experiment.output = {"error": task.error} + elif task.status == TaskStatus.PAUSED: + experiment.status = ExperimentStatus.RUNNING + + self._persist(f"experiment_{experiment.id}") + return { + "experiment_id": experiment.id, + "task_id": task.id, + "status": experiment.status.value, + "result": result, + } + + @agent_tool(token_optimized=True) + def _dispatch_experiments_batch( + self, + experiments: list[dict[str, Any]], + ) -> dict[str, Any]: + """ + Dispatch multiple experiments IN PARALLEL to separate workers. + + Each experiment runs on its own worker with its own browser tab, + all executing concurrently. Use this when you have independent + experiments that don't depend on each other's results. + + Much faster than calling dispatch_experiment sequentially — N experiments + run in roughly the time of 1. + + Args: + experiments: List of experiment dicts, each with: + - hypothesis: What we're testing (specific and falsifiable) + - rationale: WHY we're testing this + - methodology: Instructions for the worker (reference tools by name!) 
+ - output_description: (optional) Description of expected output + """ + if not experiments: + return {"error": "No experiments provided"} + + # Gate: must review docs first + if not self._docs_reviewed and self._documentation_data_loader is not None: + return { + "error": ( + "You must review the routine documentation BEFORE dispatching experiments. " + "Call search_files(scope='docs', query='operation', mode='exact') and read_file(scope='docs', path='...') to understand all available " + "operation types (fetch, click, input_text, js_evaluate, get_cookies, " + "download, etc.). This ensures you design experiments that leverage the " + "full routine capabilities." + ) + } + + # Cap at num_workers to avoid overwhelming the system + max_parallel = self._num_workers + if len(experiments) > max_parallel: + logger.warning( + "Batch of %d experiments exceeds worker pool (%d), running first %d", + len(experiments), max_parallel, max_parallel, + ) + experiments = experiments[:max_parallel] + + # Phase 1: Create all experiment entries and tasks (sequential — fast, no I/O) + task_experiment_pairs: list[tuple[Task, ExperimentEntry]] = [] + for exp_dict in experiments: + worker_methodology = self._compose_worker_methodology( + methodology=exp_dict.get("methodology", ""), + routine_spec_id=None, + ) + resolved_output_description = ( + exp_dict.get("output_description") or "Structured experiment findings." + ) + + experiment = ExperimentEntry( + hypothesis=exp_dict.get("hypothesis", ""), + rationale=exp_dict.get("rationale", ""), + methodology=worker_methodology, + routine_spec_id=None, + status=ExperimentStatus.RUNNING, + ) + self._ledger.add_experiment(experiment) + + task = Task( + agent_type=SpecialistAgentType.EXPERIMENT_WORKER, + prompt=worker_methodology, + max_loops=self._worker_max_loops, + output_schema=self.DEFAULT_WORKER_OUTPUT_SCHEMA, + output_description=resolved_output_description, + ) + self._orchestration_state.add_task(task) + experiment.task_id = task.id + task_experiment_pairs.append((task, experiment)) + + self._persist("batch_dispatched") + + # Phase 2: Execute all tasks in parallel using ThreadPoolExecutor + results: list[dict[str, Any]] = [] + + def _run_one(pair: tuple[Task, ExperimentEntry]) -> dict[str, Any]: + task, experiment = pair + # Create a dedicated worker for this parallel task + worker = self._create_worker() + subagent = SubAgent( + type=task.agent_type, + llm_model=self._worker_llm_model.value, + ) + self._orchestration_state.subagents[subagent.id] = subagent + self._agent_instances[subagent.id] = worker + task.agent_id = subagent.id + subagent.task_ids.append(task.id) + + # Wire up real-time thread persistence + agent_label = f"worker_{subagent.id}" + worker._on_chat_added = lambda _chat: self._dump_agent_thread(agent_label, worker) + + try: + task.status = TaskStatus.IN_PROGRESS + task.started_at = datetime.now() + + config = AutonomousRunConfig( + min_iterations=1, + max_iterations=task.max_loops, + ) + result = worker.run_autonomous( + task=task.prompt, + config=config, + output_schema=task.output_schema, + output_description=task.output_description, + ) + task.loops_used += worker.autonomous_iteration + self._dump_agent_thread(f"worker_{subagent.id}", worker) + + if result is not None: + task.status = TaskStatus.COMPLETED + task.completed_at = datetime.now() + task.result = result.model_dump() if isinstance(result, BaseModel) else result + experiment.status = ExperimentStatus.DONE + experiment.output = task.result + else: + task.status = TaskStatus.FAILED + 
task.error = "Max loops reached without result" + experiment.status = ExperimentStatus.FAILED + experiment.output = {"error": task.error} + except Exception as e: + task.status = TaskStatus.FAILED + task.error = str(e) + task.completed_at = datetime.now() + experiment.status = ExperimentStatus.FAILED + experiment.output = {"error": str(e)} + logger.error("Parallel task %s failed: %s", task.id, e) + finally: + worker.close() + + return { + "experiment_id": experiment.id, + "hypothesis": experiment.hypothesis[:100], + "status": experiment.status.value, + "result_preview": str(experiment.output)[:300] if experiment.output else None, + } + + with ThreadPoolExecutor(max_workers=max_parallel) as pool: + futures = { + pool.submit(_run_one, pair): pair + for pair in task_experiment_pairs + } + for future in as_completed(futures, timeout=self.WORKER_TIMEOUT_SECONDS + 30): + pair = futures[future] + try: + results.append(future.result(timeout=self.WORKER_TIMEOUT_SECONDS)) + except FuturesTimeoutError: + _, experiment = pair + logger.error( + "Batch experiment %s timed out after %ds", + experiment.id, self.WORKER_TIMEOUT_SECONDS, + ) + experiment.status = ExperimentStatus.FAILED + experiment.output = {"error": f"Worker timed out after {self.WORKER_TIMEOUT_SECONDS}s"} + results.append({ + "experiment_id": experiment.id, + "status": "failed", + "error": f"Worker timed out after {self.WORKER_TIMEOUT_SECONDS}s", + }) + except Exception as e: + logger.error("Batch experiment failed: %s", e) + results.append({ + "experiment_id": pair[1].id, + "status": "failed", + "error": str(e), + }) + + self._persist("batch_completed") + + completed = sum(1 for r in results if r.get("status") == "done") + failed = sum(1 for r in results if r.get("status") == "failed") + + return { + "total": len(results), + "completed": completed, + "failed": failed, + "experiments": results, + } + + def _get_remediation_docs_for_experiment( + self, experiment: ExperimentEntry, + ) -> str | None: + """Scan experiment output for known error patterns and return relevant doc content.""" + if self._documentation_data_loader is None: + return None + + # Build searchable text from output + summary + output_text = json.dumps(experiment.output, default=str).lower() if experiment.output else "" + summary_text = (experiment.summary or "").lower() + searchable = output_text + " " + summary_text + + matched_paths: list[str] = [] + for keywords, doc_path in self._ERROR_DOC_PATTERNS: + if any(kw in searchable for kw in keywords): + matched_paths.append(doc_path) + + if not matched_paths: + return None + + doc_sections: list[str] = [] + for doc_path in matched_paths: + content = self._documentation_data_loader.get_file_content(doc_path) + if content: + doc_sections.append(f"--- {doc_path} ---\n{content}") + + if not doc_sections: + return None + + return ( + "IMPORTANT: The following documentation covers known fixes for the " + "errors observed in this experiment. Read carefully before deciding " + "to give up on this routine spec.\n\n" + "\n\n".join(doc_sections) + ) + + @agent_tool(token_optimized=True) + def _get_experiment_result(self, experiment_id: str) -> dict[str, Any]: + """ + Read the result of a completed experiment. + + Args: + experiment_id: ID of the experiment. 
+ """ + experiment = self._ledger.get_experiment(experiment_id) + if experiment is None: + return {"error": f"No experiment found with ID: {experiment_id}"} + + result: dict[str, Any] = { + "experiment_id": experiment.id, + "hypothesis": experiment.hypothesis, + "routine_spec_id": experiment.routine_spec_id, + "status": experiment.status.value, + "verdict": experiment.verdict.value if experiment.verdict else None, + "summary": experiment.summary, + "takeaways": [t.model_dump(mode="json") for t in experiment.takeaways], + "output": experiment.output, + } + + # Auto-inject relevant common-issues docs when experiment has errors + remediation_docs = self._get_remediation_docs_for_experiment(experiment) + if remediation_docs: + result["remediation_docs"] = remediation_docs + + return result + + + # =================================================================== + # RECORDING TOOLS + # =================================================================== + + @agent_tool() + def _record_finding( + self, + experiment_id: str, + verdict: str, + summary: str, + takeaways: list[dict[str, Any]] | None = None, + ) -> dict[str, Any]: + """ + Record a verdict after reviewing an experiment result. + + MUST be called for every completed experiment. This builds the + experiment log that drives your next decisions. + + Args: + experiment_id: ID of the experiment. + verdict: One of 'confirmed', 'refuted', 'partial', 'needs_followup'. + summary: What we learned, in one or two sentences. + takeaways: Optional reusable lessons for future workers. + Each item should include: + - claim (required): concrete fact to reuse + - evidence (optional): supporting detail + - how_to_apply_next (optional): instruction for later experiments + - confidence (optional): float in [0, 1] + - tags (optional): short labels like auth/pagination/endpoint + """ + experiment = self._ledger.get_experiment(experiment_id) + if experiment is None: + return {"error": f"No experiment found with ID: {experiment_id}"} + + try: + experiment.verdict = ExperimentVerdict(verdict) + except ValueError: + return { + "error": f"Invalid verdict: {verdict}. 
" + f"Must be one of: {[v.value for v in ExperimentVerdict]}" + } + + experiment.summary = summary + if takeaways is not None: + parsed_takeaways: list[ExperimentTakeaway] = [] + for idx, raw_takeaway in enumerate(takeaways): + if not isinstance(raw_takeaway, dict): + return {"error": f"takeaways[{idx}] must be an object"} + claim = raw_takeaway.get("claim") + if not isinstance(claim, str) or not claim.strip(): + return {"error": f"takeaways[{idx}].claim is required and must be a non-empty string"} + + confidence = raw_takeaway.get("confidence") + if confidence is not None: + try: + confidence = float(confidence) + except (TypeError, ValueError): + return {"error": f"takeaways[{idx}].confidence must be a float in [0, 1]"} + if confidence < 0 or confidence > 1: + return {"error": f"takeaways[{idx}].confidence must be between 0 and 1"} + + tags_raw = raw_takeaway.get("tags", []) + if tags_raw is None: + tags_raw = [] + if not isinstance(tags_raw, list): + return {"error": f"takeaways[{idx}].tags must be a list of strings"} + + tags: list[str] = [] + for t in tags_raw: + if isinstance(t, str): + tag = t.strip() + if tag: + tags.append(tag) + parsed_takeaways.append( + ExperimentTakeaway( + claim=claim.strip(), + evidence=raw_takeaway.get("evidence"), + how_to_apply_next=raw_takeaway.get("how_to_apply_next"), + confidence=confidence, + tags=tags, + ) + ) + experiment.takeaways = parsed_takeaways + + self._persist(f"finding_{experiment.id}") + return { + "experiment_id": experiment.id, + "verdict": experiment.verdict.value, + "summary": summary, + "takeaway_count": len(experiment.takeaways), + } + + @agent_tool() + def _record_proven_artifact( + self, + artifact_type: str, + details: dict[str, Any], + ) -> dict[str, Any]: + """ + Add a proven artifact to the ledger. Call this when an experiment confirms + a fetch, navigation, token, or parameter that will be part of a routine. + + IMPORTANT: The 'details' parameter MUST be a JSON object (dict), NOT a string. + + Args: + artifact_type: One of 'fetch', 'navigation', 'token', 'parameter'. + details: A JSON object with artifact-specific info. NOT a string. + + Example calls: + record_proven_artifact({ + "artifact_type": "fetch", + "details": {"url": "https://api.example.com/data", "method": "GET", + "headers": {}, "response_preview": "200 OK with JSON"} + }) + record_proven_artifact({ + "artifact_type": "navigation", + "details": {"url": "https://example.com", "sets_up": ["session_cookie"]} + }) + record_proven_artifact({ + "artifact_type": "parameter", + "details": {"name": "seasonId", "type": "number", + "description": "Season identifier", "example_value": 2025} + }) + """ + try: + atype = ArtifactType(artifact_type) + except ValueError: + return { + "error": f"Invalid artifact_type: {artifact_type}. 
" + f"Must be one of: {[t.value for t in ArtifactType]}" + } + + proven = self._ledger.proven + if atype == ArtifactType.FETCH: + proven.fetches.append(details) + elif atype == ArtifactType.NAVIGATION: + proven.navigations.append(details) + elif atype == ArtifactType.TOKEN: + proven.tokens.append(details) + elif atype == ArtifactType.PARAMETER: + proven.parameters.append(details) + + self._persist(f"artifact_{artifact_type}") + return {"ok": True, "artifact_type": artifact_type, "details": details} + + # =================================================================== + # ROUTINE SUBMISSION TOOLS + # =================================================================== + + @agent_tool(persist=ToolResultPersistMode.ALWAYS, token_optimized=True) + def _submit_routine( + self, + spec_id: str, + routine_json: dict[str, Any], + test_parameters: dict[str, Any], + ) -> dict[str, Any]: + """ + Submit a routine attempt for validation, execution, and inspection. + + Pipeline: validate → execute with test_parameters → inspect → verdict. + + IMPORTANT: You MUST provide test_parameters with realistic values for + every parameter defined in the routine. The routine will be executed + in a live browser and the result sent to an independent inspector. + + The routine_json MUST match the Routine schema (see system prompt). + Key rules: + - Use "operations" (not "steps"), each with a "type" discriminator + - fetch operations need "endpoint": {"url": "...", "method": "GET"} + - Last operation must be "return", "return_html", or "download" + - Must have ≥2 operations (navigate + fetch + return is typical) + - Use {{paramName}} placeholders in URLs/headers/body for parameters + - fetch needs session_storage_key to save results for the return op + + Example: + submit_routine({ + "spec_id": "abc123", + "routine_json": { + "name": "search_examplesite_products", + "description": "Searches the ExampleSite product catalog by query string, returning product names, prices, ratings, and availability.", + "parameters": [ + {"name": "search_query", "type": "string", "description": "Free-text search query to find products (e.g. 'wireless headphones', 'running shoes')"} + ], + "operations": [ + {"type": "navigate", "url": "https://www.example.com"}, + {"type": "fetch", "endpoint": {"url": "https://api.example.com/search?q={{search_query}}", "method": "GET"}, "session_storage_key": "result"}, + {"type": "return", "session_storage_key": "result"} + ] + }, + "test_parameters": {"search_query": "wireless headphones"} + }) + + Args: + spec_id: Which RoutineSpec this routine fulfills. + routine_json: The complete routine dict matching the Routine schema. + test_parameters: Parameter values for test execution. REQUIRED — + must include a value for every parameter in the routine. + """ + spec = self._ledger.get_spec(spec_id) + if spec is None: + return {"error": f"No spec found with ID: {spec_id}"} + + if test_parameters is None: + return { + "error": "test_parameters is required. Provide realistic values " + "for every parameter so the routine can be executed and inspected. " + "If the routine has no parameters, pass an empty object: {}" + } + + # ----- Documentation quality gate (before Pydantic validation) ----- + doc_issues = self._check_routine_documentation_quality(routine_json) + if doc_issues: + return { + "success": False, + "stage": "documentation_quality", + "issues": doc_issues, + "hint": ( + "These routines will be vectorized and stored in databases for other " + "agents to discover and use. 
Names and descriptions must be precise " + "enough for semantic search and unambiguous enough for autonomous use. " + "Fix the issues above and resubmit." + ), + } + + # ----- Site-credential parameter gate ----- + # Reject parameters that look like site-level API keys / subscription keys. + # These should be hardcoded from captures, not exposed as user parameters. + _CREDENTIAL_PATTERNS = { + "api_key", "apikey", "api-key", "subscription_key", "subscriptionkey", + "subscription-key", "client_secret", "client_id", "app_key", "appkey", + "app_secret", "secret_key", "secretkey", "access_key", "accesskey", + } + params = routine_json.get("parameters", []) + suspect_params = [ + p["name"] for p in params + if isinstance(p, dict) and p.get("name", "").lower().replace("-", "_") in _CREDENTIAL_PATTERNS + ] + if suspect_params: + return { + "success": False, + "stage": "credential_parameter_check", + "suspect_parameters": suspect_params, + "error": ( + f"Parameter(s) {suspect_params} look like site-level API keys or " + "subscription keys. These are NOT user secrets — they are constants " + "baked into the website's JavaScript or network requests. " + "You MUST: (1) find the actual value from captures using " + "get_recorded_transaction to inspect request headers, " + "(2) hardcode it directly in the routine's headers/body, " + "(3) remove it from parameters. " + "Only parameterize values a USER would naturally provide " + "(search terms, dates, IDs, locations)." + ), + } + + # Check attempt limit + existing_attempts = self._ledger.get_attempts_for_spec(spec_id) + if len(existing_attempts) >= self._max_attempts_per_routine: + return { + "error": f"Max attempts ({self._max_attempts_per_routine}) reached for {spec.name}. " + "Consider mark_routine_failed if this routine can't be built." + } + + # ----- Duplicate routine check ----- + # Hash the operations list to detect identical resubmissions. + # This prevents wasting inspector tokens on the exact same broken routine. + new_ops_hash = hashlib.sha256( + json.dumps(routine_json.get("operations", []), sort_keys=True).encode() + ).hexdigest() + for prev_attempt in existing_attempts: + prev_ops_hash = hashlib.sha256( + json.dumps(prev_attempt.routine_json.get("operations", []), sort_keys=True).encode() + ).hexdigest() + if new_ops_hash == prev_ops_hash: + prev_issues = prev_attempt.blocking_issues or [] + return { + "error": ( + f"This routine has IDENTICAL operations to attempt #{prev_attempt.id} " + f"which already FAILED inspection. Resubmitting the same routine wastes " + f"tokens and will produce the same result. You MUST change the operations " + f"to address the previous blocking issues before resubmitting." + ), + "previous_blocking_issues": prev_issues, + "hint": ( + "Review the blocking issues above. Common fixes: add auth/token " + "operations, add missing headers, fix endpoint URLs, resolve " + "placeholder issues. The routine must be STRUCTURALLY DIFFERENT " + "from previous failed attempts." + ), + } + + # Step 1: Validate routine JSON against the Routine model + try: + routine = Routine.model_validate(routine_json) + except ValidationError as e: + issues: list[str] = [] + for err in e.errors(): + loc = ".".join(str(part) for part in err.get("loc", [])) or "root" + msg = err.get("msg", "Invalid value") + issues.append(f"{loc}: {msg}") + return { + "success": False, + "stage": "validation", + "validation_errors": issues or [str(e)], + "issues": issues or [str(e)], + "hint": ( + "Routine validation failed. 
BEFORE retrying, use your documentation " + "tools to review the correct schema: call search_files(scope='docs', query='Routine operation " + "endpoint fetch', mode='exact') or read_file(scope='docs', path='...') to read the Routine and operation " + "model source code. Key reminders: each operation needs 'type' " + "(e.g. 'fetch'), fetch operations need 'endpoint': {'url': '...', " + "'method': 'GET'}, last operation must be type 'return' or " + "'return_html' or 'download', and the routine needs at least 2 " + "operations. Check the schema in your system prompt carefully." + ), + } + except Exception as e: + return { + "success": False, + "stage": "validation", + "validation_errors": [str(e)], + "issues": [str(e)], + "hint": ( + "Routine validation failed. BEFORE retrying, use your documentation " + "tools to review the correct schema and operation requirements." + ), + } + + # Create attempt record + parent_id = existing_attempts[-1].id if existing_attempts else None + attempt = RoutineAttempt( + routine_spec_id=spec_id, + routine_json=json.loads(routine.model_dump_json()), + status=RoutineAttemptStatus.VALIDATING, + test_parameters=test_parameters, + parent_attempt_id=parent_id, + ) + self._ledger.add_attempt(attempt) + spec.status = RoutineSpecStatus.VALIDATING + self._persist(f"attempt_{attempt.id}_validated") + # Persist an initial attempt record before execution/inspection. + # The final record for this attempt_id will overwrite this file. + self._record_attempt( + spec=spec, + attempt=attempt, + routine_json=routine_json, + test_parameters=test_parameters, + execution_result=None, + inspection_result=None, + ) + + # Step 2: Execute the routine with test parameters + attempt.status = RoutineAttemptStatus.EXECUTING + self._persist(f"attempt_{attempt.id}_executing") + + execution_result = self._execute_routine_with_params(routine, test_parameters) + + if execution_result is not None: + attempt.execution_result = execution_result.model_dump() + if not execution_result.ok: + attempt.execution_error = execution_result.error + logger.warning( + "Routine %s execution failed: %s", spec.name, execution_result.error, + ) + else: + attempt.execution_error = "Execution unavailable (no browser or execution crashed)" + + self._persist(f"attempt_{attempt.id}_executed") + + # Step 3: Send to inspector for quality review + attempt.status = RoutineAttemptStatus.INSPECTING + self._persist(f"attempt_{attempt.id}_inspecting") + + inspection_result = self._run_inspection(routine, execution_result, spec) + + if inspection_result is not None: + # run_autonomous returns a SpecialistResultWrapper dict with the actual + # inspection data nested under "output". Unwrap it so downstream code + # can read overall_pass / blocking_issues / recommendations directly. 
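+ # If the inspector returned a bare dict with no "output" wrapper,
+ # .get falls through to the dict itself.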
+ inner = inspection_result.get("output", inspection_result) + attempt.inspection_result = inner + attempt.overall_pass = inner.get("overall_pass", False) + attempt.blocking_issues = inner.get("blocking_issues", []) + attempt.recommendations = inner.get("recommendations", []) + + if attempt.overall_pass: + attempt.status = RoutineAttemptStatus.PASSED + else: + attempt.status = RoutineAttemptStatus.FAILED + else: + # Inspector failed — treat as passed with warning (let PI decide) + attempt.status = RoutineAttemptStatus.PASSED + attempt.recommendations = ["Inspector was unavailable — manual review recommended"] + + self._persist(f"attempt_{attempt.id}_inspected") + + # Build response + response: dict[str, Any] = { + "success": True, + "attempt_id": attempt.id, + "spec_id": spec_id, + "operations_count": len(routine.operations), + "parameters_count": len(routine.parameters), + } + + # Prepare execution payload (appended as the LAST response key below) + if execution_result is not None: + execution_payload: dict[str, Any] = execution_result.model_dump(mode="json") + else: + execution_payload = {"ok": False, "error": attempt.execution_error} + + # Inspection summary + if inspection_result is not None: + response["inspection"] = { + "overall_pass": attempt.overall_pass, + "overall_score": inner.get("overall_score"), + "blocking_issues": attempt.blocking_issues, + "recommendations": attempt.recommendations, + "summary": inner.get("summary"), + } + else: + response["inspection"] = {"overall_pass": None, "note": "Inspector unavailable"} + + response["verdict"] = attempt.status.value + + # ----- Remediation hints for failed inspections ----- + if not attempt.overall_pass and attempt.blocking_issues: + hints: list[str] = [] + issues_text = " ".join(attempt.blocking_issues).lower() + if "failed to fetch" in issues_text or "typeerror" in issues_text: + hints.append( + "CORS FIX: Add a 'navigate' operation to the API's allowed origin " + "BEFORE any fetch. Routines start from about:blank (origin=null) so " + "all cross-origin fetches fail. Example: if API is at api.example.com " + "but CORS allows www.example.com, add {\"type\": \"navigate\", " + "\"url\": \"https://www.example.com\"} as the FIRST operation. " + "Review docs: search_files(scope='docs', query='cors-failed-to-fetch', mode='exact')." + ) + if "401" in issues_text or "403" in issues_text or "unauthorized" in issues_text or "access denied" in issues_text: + hints.append( + "AUTH FIX: The routine is missing authentication. Add a fetch " + "operation to obtain a token/key, then a js_evaluate to extract " + "it, then include it in subsequent fetch headers via a " + "sessionStorage placeholder. Review docs: " + "search_files(scope='docs', query='unauthenticated', mode='exact')." + ) + if "documentation quality" in issues_text: + hints.append( + "DOCS FIX: Improve routine name (verb_site_noun, 3+ segments), " + "description (>=8 words, explain action+inputs+outputs), and " + "parameter descriptions (>=3 words, explain where to get values)." + ) + if hints: + response["remediation_hints"] = hints + + # ----- Persist unified attempt record ----- + # Overwrite the initial record for this attempt with final verdict/results. + self._record_attempt( + spec=spec, + attempt=attempt, + routine_json=routine_json, + test_parameters=test_parameters, + execution_result=execution_result, + inspection_result=inspection_result, + ) + + # Keep execution as the final key for readability in tool responses. 
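+ # Execution payloads can be large, so placing them last keeps the
+ # verdict, inspection summary, and remediation hints visible first.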
+ response["execution"] = execution_payload + + return response + + @agent_tool() + def _mark_routine_shipped( + self, + spec_id: str, + attempt_id: str, + when_to_use: str, + parameters_summary: list[str] | None = None, + ) -> dict[str, Any]: + """ + Mark a routine as shipped after it passes inspection/validation. + Moves the spec status to "shipped". + + Args: + spec_id: ID of the RoutineSpec. + attempt_id: ID of the RoutineAttempt to ship. + when_to_use: Guidance for the user on when to use this routine. + parameters_summary: Human-readable parameter descriptions. + """ + spec = self._ledger.get_spec(spec_id) + if spec is None: + return {"error": f"No spec found with ID: {spec_id}"} + + attempt = self._ledger.get_attempt(attempt_id) + if attempt is None: + return {"error": f"No attempt found with ID: {attempt_id}"} + + spec.status = RoutineSpecStatus.SHIPPED + spec.shipped_attempt_id = attempt_id + + self._persist(f"shipped_{spec.name}") + return { + "ok": True, + "shipped": spec.name, + "attempt_id": attempt_id, + } + + @agent_tool() + def _mark_routine_failed( + self, + spec_id: str, + reason: str, + ) -> dict[str, Any]: + """ + Give up on a specific routine. Records why it failed. + + Args: + spec_id: ID of the RoutineSpec. + reason: Why this routine can't be built. + """ + spec = self._ledger.get_spec(spec_id) + if spec is None: + return {"error": f"No spec found with ID: {spec_id}"} + + # Reject if already failed — stop the PI from looping on the same spec + if spec.status == RoutineSpecStatus.FAILED: + return { + "error": ( + f"Routine '{spec.name}' is ALREADY marked as failed. " + "Do not call mark_routine_failed again. Move on to the next " + "unaddressed routine — call get_ledger to see which routines " + "still need work, then dispatch_experiment or submit_routine for those." + ) + } + + # Reject if already shipped + if spec.status == RoutineSpecStatus.SHIPPED: + return {"error": f"Routine '{spec.name}' is already shipped. Cannot mark as failed."} + + # Global guardrail: require enough failed routine attempts across the pipeline + failed_attempts_global = sum( + 1 for attempt in self._ledger.attempts + if attempt.status == RoutineAttemptStatus.FAILED + ) + if failed_attempts_global < self._min_global_failed_attempts_before_routine_failure: + return { + "error": ( + f"Cannot mark routine '{spec.name}' as failed yet. " + f"Global failed routine attempts: {failed_attempts_global}/" + f"{self._min_global_failed_attempts_before_routine_failure} required. " + "Keep iterating: submit improved routine attempts, inspect failures, " + "run experiments to delegate exploration to workers when needed, " + "and only mark routines failed after enough global evidence exists." + ) + } + + # Guardrail: require minimum experimentation before giving up + spec_experiments = self._ledger.get_experiments_for_spec(spec_id) + if len(spec_experiments) < self.MIN_EXPERIMENTS_BEFORE_ROUTINE_FAILURE: + return { + "error": ( + f"Cannot mark routine '{spec.name}' as failed after only " + f"{len(spec_experiments)} experiment(s). Try at least " + f"{self.MIN_EXPERIMENTS_BEFORE_ROUTINE_FAILURE} experiments " + "with different approaches before giving up. Consider: CDP-level " + "intercepts, direct navigation to API URLs, or checking the " + "captured session data for working request patterns." 
+ ) + } + + spec.status = RoutineSpecStatus.FAILED + spec.failure_reason = reason + + self._persist(f"failed_{spec.name}") + return {"ok": True, "failed": spec.name, "reason": reason} + + # =================================================================== + # DASHBOARD TOOL + # =================================================================== + + @agent_tool(token_optimized=True) + def _get_ledger(self) -> dict[str, Any]: + """ + Read the full Discovery Ledger — routine specs, experiments, proven + artifacts, attempts, and unresolved questions. Use this to review + progress and decide what to work on next. + """ + return { + "summary": self._ledger.to_summary(), + "total_specs": len(self._ledger.routine_specs), + "shipped": sum( + 1 for s in self._ledger.routine_specs + if s.status == RoutineSpecStatus.SHIPPED + ), + "failed": sum( + 1 for s in self._ledger.routine_specs + if s.status == RoutineSpecStatus.FAILED + ), + "total_experiments": len(self._ledger.experiments), + "confirmed": len(self._ledger.get_confirmed_experiments()), + "total_attempts": len(self._ledger.attempts), + "proven_fetches": len(self._ledger.proven.fetches), + "proven_navigations": len(self._ledger.proven.navigations), + "proven_tokens": len(self._ledger.proven.tokens), + "proven_parameters": len(self._ledger.proven.parameters), + "unresolved": self._ledger.unresolved, + } + + # =================================================================== + # TERMINATION TOOLS + # =================================================================== + + @agent_tool() + def _mark_complete(self, usage_guide: str) -> dict[str, Any]: + """ + Signal that the pipeline is done. Call this when ALL routines + have been addressed (shipped or failed). + + Provides a usage_guide string explaining how to use the routines + together and when to use each one. Builds the final RoutineCatalog. + + Args: + usage_guide: How to use these routines together. Include: + - What each routine does + - When to use each one + - How they relate to each other + - What parameters each expects + """ + # Guardrail: reject if routines are still unaddressed + unaddressed = [ + s for s in self._ledger.routine_specs + if s.status not in (RoutineSpecStatus.SHIPPED, RoutineSpecStatus.FAILED) + ] + shipped_count = sum( + 1 for s in self._ledger.routine_specs + if s.status == RoutineSpecStatus.SHIPPED + ) + if unaddressed: + unaddressed_names = [f"{s.name} ({s.status.value})" for s in unaddressed] + return { + "error": ( + f"Cannot mark complete — {len(unaddressed)} routine(s) are still unaddressed: " + f"{', '.join(unaddressed_names)}. " + "Each routine must be either shipped (via submit_routine → mark_routine_shipped) " + "or explicitly failed (via mark_routine_failed) before calling mark_complete. " + "You must build and submit actual routine JSON with test_parameters for each routine." + ) + } + + # Guardrail: reject if nothing was shipped at all + if shipped_count == 0: + return { + "error": ( + "Cannot mark complete with 0 shipped routines. At least one routine " + "must be successfully built, submitted, and shipped. Use submit_routine " + "with a complete routine_json and test_parameters to create routine attempts." 
+ ) + } + + catalog = self._build_catalog(usage_guide) + self._ledger.catalog = catalog + self._pipeline_result = catalog + self._is_done = True + + self._persist("complete") + return { + "status": "complete", + "routines_shipped": len(catalog.routines), + "routines_failed": len(catalog.failed_routines), + "total_experiments": catalog.total_experiments, + "total_attempts": catalog.total_attempts, + } + + @agent_tool() + def _mark_failed(self, reason: str) -> dict[str, Any]: + """ + Signal that the pipeline has failed — can't build ANY routines at all. + + Args: + reason: Why construction failed entirely. + """ + # Guardrail: prevent premature pipeline abandonment + total_experiments = len(self._ledger.experiments) + unaddressed_specs = [ + s for s in self._ledger.routine_specs + if s.status not in (RoutineSpecStatus.SHIPPED, RoutineSpecStatus.FAILED) + ] + if total_experiments < self._min_experiments_before_fail and unaddressed_specs: + return { + "error": ( + f"Cannot mark pipeline as failed after only {total_experiments} experiment(s). " + f"You have {len(unaddressed_specs)} unaddressed routine(s). " + "Try alternative approaches: use search_recorded_transactions to find working " + "request patterns, use browser_cdp_command for CDP-level intercepts, or " + "navigate directly to API URLs. Mark individual routines as failed with " + "mark_routine_failed if they truly can't be built, then call mark_complete." + ) + } + + self._is_done = True + self._pipeline_result = None + logger.warning("PI marked pipeline as failed: %s", reason) + + self._persist("failed") + return {"status": "failed", "reason": reason} + + # =================================================================== + # Internal — catalog building + # =================================================================== + + def _build_catalog(self, usage_guide: str) -> RoutineCatalog: + """Build a RoutineCatalog from the current ledger state.""" + shipped_routines: list[ShippedRoutine] = [] + failed_routines: list[dict[str, Any]] = [] + + for spec in self._ledger.routine_specs: + if spec.status == RoutineSpecStatus.SHIPPED and spec.shipped_attempt_id: + attempt = self._ledger.get_attempt(spec.shipped_attempt_id) + if attempt: + routine_name = attempt.routine_json.get("name") or spec.name + routine_description = attempt.routine_json.get("description") or spec.description + shipped_routines.append(ShippedRoutine( + routine_spec_id=spec.id, + routine_json=attempt.routine_json, + name=routine_name, + description=routine_description, + when_to_use=f"Use to {routine_description.lower()}", + parameters_summary=[], + inspection_score=attempt.inspection_result.get("overall_score", 0) + if attempt.inspection_result else 0, + )) + elif spec.status == RoutineSpecStatus.FAILED: + failed_routines.append({ + "name": spec.name, + "description": spec.description, + "reason": spec.failure_reason or "Unknown", + }) + + # Infer site from exploration summaries or first URL + site = "unknown" + for summary_text in self._exploration_summaries_raw.values(): + if "://" in summary_text: + # Try to extract domain + match = re.search(r'https?://([^/\s]+)', summary_text) + if match: + site = match.group(1) + break + + return RoutineCatalog( + site=site, + user_task=self._task, + routines=shipped_routines, + usage_guide=usage_guide, + failed_routines=failed_routines, + total_experiments=len(self._ledger.experiments), + total_attempts=len(self._ledger.attempts), + ) + + def _build_partial_catalog(self) -> RoutineCatalog | None: + """Build a partial catalog 
from whatever has been shipped so far.""" + shipped = [ + s for s in self._ledger.routine_specs + if s.status == RoutineSpecStatus.SHIPPED + ] + if not shipped: + return None + return self._build_catalog( + "Pipeline hit iteration limit. These routines were completed." + ) + + # =================================================================== + # Internal — worker management + # =================================================================== + + def _create_worker(self) -> ExperimentWorker: + """Create a new ExperimentWorker instance with all available context.""" + worker_workspace = ( + self._worker_workspace_factory() + if self._worker_workspace_factory is not None + else None + ) + return ExperimentWorker( + emit_message_callable=self._emit_message_callable, + # Browser context + remote_debugging_address=self._remote_debugging_address, + # Capture data loaders + network_data_loader=self._network_data_loader, + storage_data_loader=self._storage_data_loader, + dom_data_loader=self._dom_data_loader, + window_property_data_loader=self._window_property_data_loader, + # Config + llm_model=self._worker_llm_model, + workspace=worker_workspace, + ) + + def _create_inspector(self) -> RoutineInspector: + """Create a new RoutineInspector instance.""" + inspector_workspace = ( + self._inspector_workspace_factory() + if self._inspector_workspace_factory is not None + else None + ) + return RoutineInspector( + emit_message_callable=self._emit_message_callable, + llm_model=self._worker_llm_model, + documentation_data_loader=self._documentation_data_loader, + workspace=inspector_workspace, + ) + + def _get_or_create_agent(self, task: Task) -> AbstractAgent: + """ + Get existing agent instance or create/reuse one for the task. + + Workers are capped at num_workers. Once the pool is full, new tasks + are assigned round-robin to existing workers (each gets a fresh + autonomous run but the PI can still dispatch new experiments to the same worker). + """ + if task.agent_id and task.agent_id in self._agent_instances: + return self._agent_instances[task.agent_id] + + # Check if we can reuse an existing worker (pool is full) + worker_ids = [ + sid for sid, agent in self._agent_instances.items() + if isinstance(agent, ExperimentWorker) + ] + + if len(worker_ids) >= self._num_workers: + # Round-robin to existing workers + reuse_id = worker_ids[self._worker_counter % len(worker_ids)] + self._worker_counter += 1 + task.agent_id = reuse_id + subagent = self._orchestration_state.subagents.get(reuse_id) + if subagent: + subagent.task_ids.append(task.id) + # Close old browser tab — _ensure_browser will create a fresh one + worker = self._agent_instances[reuse_id] + if isinstance(worker, ExperimentWorker): + worker.close() + return worker + + # Create new worker + agent = self._create_worker() + + subagent = SubAgent( + type=task.agent_type, + llm_model=self._worker_llm_model.value, + ) + self._orchestration_state.subagents[subagent.id] = subagent + self._agent_instances[subagent.id] = agent + + task.agent_id = subagent.id + subagent.task_ids.append(task.id) + + # Wire up real-time thread persistence + agent_label = f"worker_{subagent.id}" + agent._on_chat_added = lambda _chat: self._dump_agent_thread(agent_label, agent) + + return agent + + def _get_or_create_inspector(self) -> RoutineInspector: + """ + Get an existing inspector instance or create one. + + Inspectors are capped at num_inspectors. Once the pool is full, + existing inspectors are reused round-robin (each gets a fresh + autonomous run via reset). 
+ """ + inspector_ids = [ + sid for sid, agent in self._agent_instances.items() + if isinstance(agent, RoutineInspector) + ] + + if len(inspector_ids) < self._num_inspectors: + # Pool not full — create a new inspector + inspector = self._create_inspector() + subagent = SubAgent( + type=SpecialistAgentType.ROUTINE_INSPECTOR, + llm_model=self._worker_llm_model.value, + ) + self._orchestration_state.subagents[subagent.id] = subagent + self._agent_instances[subagent.id] = inspector + # Wire up real-time thread persistence + inspector_label = f"inspector_{subagent.id}" + inspector._on_chat_added = lambda _chat: self._dump_agent_thread(inspector_label, inspector) + return inspector + + # Pool full — reuse round-robin with fresh conversation + reuse_id = inspector_ids[self._inspector_counter % len(inspector_ids)] + self._inspector_counter += 1 + inspector = self._agent_instances[reuse_id] + assert isinstance(inspector, RoutineInspector) + inspector.reset() + return inspector + + # =================================================================== + # Internal — routine execution and inspection + # =================================================================== + + def _execute_routine_with_params( + self, + routine: Routine, + test_parameters: dict[str, Any] | None, + ) -> RoutineExecutionResultWithMetadata | None: + """Execute a routine with test parameters in a live browser.""" + if not self._remote_debugging_address: + logger.warning("No remote_debugging_address — skipping routine execution") + return None + + try: + result = routine.execute( + parameters_dict=test_parameters, + remote_debugging_address=self._remote_debugging_address, + timeout=120.0, + close_tab_when_done=True, + incognito=True, + ) + return result + except Exception as e: + logger.error("Routine execution failed: %s", e) + return None + + def _run_inspection( + self, + routine: Routine, + execution_result: RoutineExecutionResultWithMetadata | None, + spec: RoutineSpec, + ) -> dict[str, Any] | None: + """Run a RoutineInspector on a routine + execution result.""" + inspector = self._get_or_create_inspector() + + # Build inspection prompt with all context + # NOTE: User task is intentionally excluded — the inspector should judge + # the routine on its own merits (correctness, robustness, data quality), + # not whether it fulfills the user's high-level goal. 
+ prompt_parts: list[str] = [ + f"## Routine Name\n{routine.name}\n", + f"## Routine Description\n{routine.description}\n", + f"## Routine JSON\n```json\n{json.dumps(routine.model_dump(), indent=2, default=str)}\n```\n", + ] + + if execution_result is not None: + exec_data = execution_result.model_dump(mode="json") + exec_json = json.dumps(exec_data, indent=2, default=str) + + persisted_for_inspector = False + if ( + len(exec_json) > self.INSPECTOR_INLINE_EXECUTION_MAX_CHARS + and inspector.has_workspace + ): + try: + inspector_workspace = inspector._require_workspace() + inspector_workspace.ensure_dirs() + safe_spec = re.sub(r"[^a-zA-Z0-9_.-]+", "_", spec.name).strip("_") + if not safe_spec: + safe_spec = spec.id + artifact_ref = inspector_workspace.save_artifact( + source="raw", + filename=f"{safe_spec}_execution_result.json", + content=exec_json, + tool_name="pi_run_inspection", + content_type="json", + metadata={ + "spec_id": spec.id, + "spec_name": spec.name, + "char_count": len(exec_json), + }, + ) + prompt_parts.append( + "## Execution Result\n" + f"Execution payload is large ({len(exec_json)} chars) and was saved to:\n" + f"- workspace path: `{artifact_ref.relative_path}`\n" + f"- artifact_id: `{artifact_ref.artifact_id}`\n\n" + "Use `execute_python` or `read_file(scope=\"workspace\", path=\"...\")` to inspect " + "this file directly and base your judgment on the full payload.\n" + "Do not claim truncation — the full execution result is available in workspace raw/.\n" + ) + persisted_for_inspector = True + except Exception as e: + logger.warning( + "Failed to persist large execution payload for inspector; falling back to inline JSON: %s", + e, + ) + + if not persisted_for_inspector: + prompt_parts.append( + f"## Execution Result\n```json\n{exec_json}\n```\n" + ) + else: + prompt_parts.append("## Execution Result\nNot available (no browser or execution failed).\n") + + base_prompt = "\n".join(prompt_parts) + + # Add exploration summaries for cross-reference. If the combined prompt gets too + # large, drop exploration summaries first. Do NOT truncate execution payload. 
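+        # Worked example of the size guard below (numbers are assumed): with
+        # max_chars = 120_000, an 80_000-char base prompt plus 50_000 chars of
+        # exploration summaries exceeds the cap, so the summaries section is
+        # replaced with a one-line omission notice while the full execution
+        # payload stays in the prompt.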
+ exploration_section = "" + if self._exploration_summaries: + exploration_parts: list[str] = ["## Exploration Summaries\n"] + for domain, summary in self._exploration_summaries.items(): + exploration_parts.append(f"### {domain}\n{summary}\n") + exploration_section = "\n".join(exploration_parts) + + inspection_prompt = ( + f"{base_prompt}\n\n{exploration_section}" + if exploration_section + else base_prompt + ) + + max_chars = 120_000 + if len(inspection_prompt) > max_chars and exploration_section: + logger.warning( + "Inspection prompt too large (%d chars); omitting exploration summaries to preserve full execution payload", + len(inspection_prompt), + ) + inspection_prompt = ( + f"{base_prompt}\n\n" + "## Exploration Summaries\n" + "[omitted due prompt size; consult persisted exploration artifacts if needed]\n" + ) + + if len(inspection_prompt) > max_chars: + logger.warning( + "Inspection prompt still large after omitting summaries (%d chars); sending full routine + execution payload without truncation", + len(inspection_prompt), + ) + + try: + config = AutonomousRunConfig(min_iterations=1, max_iterations=10) + + # Run inspector with timeout to prevent indefinite hangs + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit( + inspector.run_autonomous, + task=inspection_prompt, + config=config, + output_schema=RoutineInspectionResult.model_json_schema(), + output_description="RoutineInspectionResult with scores, blocking issues, and verdict", + ) + try: + result = future.result(timeout=self.WORKER_TIMEOUT_SECONDS) + except FuturesTimeoutError: + logger.error( + "Inspector timed out for %s after %ds", spec.name, self.WORKER_TIMEOUT_SECONDS, + ) + self._dump_agent_thread(f"inspector_{spec.name}", inspector) + return None + + # Dump inspector thread + self._dump_agent_thread(f"inspector_{spec.name}", inspector) + + if result is not None: + return result.model_dump() if isinstance(result, BaseModel) else result + return None + except Exception as e: + logger.error("Inspection failed for %s: %s", spec.name, e) + return None + + def _execute_task(self, task: Task) -> dict[str, Any]: + """Execute a task using an ExperimentWorker with a timeout guard.""" + task.status = TaskStatus.IN_PROGRESS + task.started_at = datetime.now() + + try: + agent = self._get_or_create_agent(task) + + remaining_loops = task.max_loops - task.loops_used + if remaining_loops <= 0: + task.status = TaskStatus.FAILED + task.error = "No loops remaining" + return {"success": False, "error": "No loops remaining"} + + config = AutonomousRunConfig( + min_iterations=1, + max_iterations=remaining_loops, + ) + + # Run with timeout to prevent indefinite hangs (LLM or browser) + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit( + agent.run_autonomous, + task=task.prompt, + config=config, + output_schema=task.output_schema, + output_description=task.output_description, + ) + try: + result = future.result(timeout=self.WORKER_TIMEOUT_SECONDS) + except FuturesTimeoutError: + logger.error( + "Task %s timed out after %ds", task.id, self.WORKER_TIMEOUT_SECONDS, + ) + task.status = TaskStatus.FAILED + task.error = f"Worker timed out after {self.WORKER_TIMEOUT_SECONDS}s" + task.completed_at = datetime.now() + self._dump_agent_thread(f"worker_{task.agent_id}", agent) + return {"success": False, "error": task.error} + + task.loops_used += agent.autonomous_iteration + + # Dump the worker's full message history for debugging + self._dump_agent_thread(f"worker_{task.agent_id}", agent) + + if 
result is not None: + task.status = TaskStatus.COMPLETED + task.completed_at = datetime.now() + task.result = result.model_dump() if isinstance(result, BaseModel) else result + return {"success": True, "result": task.result} + else: + if task.loops_used < task.max_loops: + task.status = TaskStatus.PAUSED + return {"success": False, "status": "paused", "loops_used": task.loops_used} + else: + task.status = TaskStatus.FAILED + task.error = "Max loops reached without result" + return {"success": False, "error": task.error} + + except Exception as e: + task.status = TaskStatus.FAILED + task.error = str(e) + task.completed_at = datetime.now() + logger.error("Task %s failed: %s", task.id, e) + return {"success": False, "error": str(e)} + + # =================================================================== + # Internal — documentation quality checks + # =================================================================== + + @staticmethod + def _check_routine_documentation_quality(routine_json: dict[str, Any]) -> list[str]: + """ + Validate that routine metadata is detailed enough for vectorized storage + and discovery by other agents. Returns a list of issues (empty = pass). + """ + issues: list[str] = [] + + # --- Routine name --- + name = routine_json.get("name", "") + if not name: + issues.append("Routine name is missing.") + else: + # Must be snake_case (lowercase + underscores) + if not re.match(r'^[a-z][a-z0-9]*(_[a-z0-9]+)*$', name): + issues.append( + f"Routine name '{name}' must be snake_case (e.g. 'get_premierleague_standings', " + "'search_amtrak_trains'). No camelCase, no spaces, no uppercase." + ) + # Must be descriptive — at least 3 underscore-separated segments + # (verb + site/context + noun, e.g. get_premierleague_standings) + segments = name.split("_") + if len(segments) < 3: + issues.append( + f"Routine name '{name}' needs more context ({len(segments)} segments, need ≥3). " + "The name must include the site/service so it makes sense in isolation. " + "Pattern: verb_site_noun (e.g. 'get_premierleague_standings', " + "'search_espn_scores', 'fetch_amtrak_schedules'). " + "Another agent reading ONLY the name should know what site this targets." + ) + # Reject overly generic names that lack site context + _GENERIC_NOUNS = { + "data", "items", "item", "content", "results", "result", + "info", "details", "list", "response", "output", "records", + } + # Check if the non-verb segments are all generic + non_verb_segments = segments[1:] if len(segments) > 1 else [] + if non_verb_segments and all(seg in _GENERIC_NOUNS for seg in non_verb_segments): + issues.append( + f"Routine name '{name}' uses only generic nouns ({non_verb_segments}). " + "Include the site/domain name and a specific noun. " + "Example: 'get_content_item' → 'get_premierleague_article', " + "'fetch_data' → 'fetch_espn_game_scores'." + ) + + # --- Routine description --- + desc = routine_json.get("description", "") + if not desc: + issues.append("Routine description is missing.") + else: + word_count = len(desc.split()) + if word_count < 8: + issues.append( + f"Routine description is too short ({word_count} words). Must be ≥8 words. " + "Describe: what the routine does, what inputs it takes, and what data it returns. 
" + "Example: 'Fetches Premier League standings for a given competition and season, " + "returning team names, positions, wins, draws, losses, and points.'" + ) + # Should mention what it returns + desc_lower = desc.lower() + return_keywords = ("return", "fetch", "retriev", "get", "extract", "download", "output", "produc", "yield") + if not any(kw in desc_lower for kw in return_keywords): + issues.append( + "Routine description should explain what data it returns. " + "Include words like 'returns', 'fetches', 'retrieves', or 'extracts'." + ) + + # --- Parameter descriptions --- + # Suffixes/keywords that signal an opaque, non-obvious parameter value + _OPAQUE_SIGNALS = ("_id", "_ids", "_slug", "_code", "_token", "_key", "_hash", "_uuid") + _SOURCE_KEYWORDS = ( + "obtain", "from the", "get from", "found in", "returned by", + "use the", "listed by", "provided by", "available via", "see the", + "look up", "call the", "via the", "endpoint", "routine", + ) + + params = routine_json.get("parameters", []) + for param in params: + if not isinstance(param, dict): + continue + pname = param.get("name", "unknown") + pdesc = param.get("description", "") + if not pdesc: + issues.append(f"Parameter '{pname}' is missing a description.") + continue + + if len(pdesc.split()) < 3: + issues.append( + f"Parameter '{pname}' description is too terse: '{pdesc}'. " + "Descriptions must be ≥3 words and explain what the value represents " + "and its expected format (e.g. 'The unique season identifier, typically a 4-digit year like 2024')." + ) + continue + + # Check if this looks like an opaque/non-obvious parameter + pname_lower = pname.lower() + ptype = param.get("type", "string") + is_opaque = ( + any(pname_lower.endswith(sig) for sig in _OPAQUE_SIGNALS) + or ptype in ("integer", "number") and pname_lower.endswith("id") + ) + + if is_opaque: + pdesc_lower = pdesc.lower() + has_source = any(kw in pdesc_lower for kw in _SOURCE_KEYWORDS) + if not has_source: + issues.append( + f"Parameter '{pname}' looks like an opaque/internal identifier but its " + f"description doesn't explain WHERE to get valid values. " + f"Current description: '{pdesc}'. " + "For non-obvious IDs, slugs, and codes, the description MUST say how " + "to obtain valid values — e.g. 'Obtain from the get_competitions routine' " + "or 'Found in the /api/seasons endpoint response'." + ) + + return issues + + def close(self) -> None: + """Clean up all worker agent instances.""" + for agent in self._agent_instances.values(): + if hasattr(agent, "close"): + try: + agent.close() + except Exception: + pass + self._agent_instances.clear() diff --git a/bluebox/agents/routine_discovery_agent_beta.py b/bluebox/agents/routine_discovery_agent_beta.py deleted file mode 100644 index 8eae08d9..00000000 --- a/bluebox/agents/routine_discovery_agent_beta.py +++ /dev/null @@ -1,2157 +0,0 @@ -""" -bluebox/agents/routine_discovery_agent_beta.py - -RoutineDiscoveryAgentBeta - orchestrator for routine discovery. - -This agent coordinates specialist subagents (JSSpecialist, NetworkSpecialist, etc.) -to discover routines from CDP captures. It delegates specific tasks to specialists -while managing the overall discovery workflow: - -1. PLANNING: Analyze task, plan approach -2. DISCOVERING: Delegate discovery tasks to specialists -3. CONSTRUCTING: Build routine from discoveries -4. VALIDATING: Test the constructed routine -5. COMPLETE/FAILED: Finish discovery - -The agent inherits from AbstractAgent for LLM/chat/tool infrastructure. 
-""" - -from __future__ import annotations - -import json -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from textwrap import dedent -from typing import Any, Callable - -from pydantic import BaseModel - -from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool -from bluebox.agents.specialists.abstract_specialist import AbstractSpecialist, AutonomousConfig, RunMode -from bluebox.agents.specialists.js_specialist import JSSpecialist -from bluebox.agents.specialists.network_specialist import NetworkSpecialist -from bluebox.agents.specialists.value_trace_resolver_specialist import ValueTraceResolverSpecialist -from bluebox.agents.specialists.interaction_specialist import InteractionSpecialist -from bluebox.data_models.llms.interaction import ( - Chat, - ChatRole, - ChatThread, - EmittedMessage, - ChatResponseEmittedMessage, - ErrorEmittedMessage, -) -from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel -from bluebox.data_models.orchestration.task import Task, SubAgent, TaskStatus, SpecialistAgentType -from bluebox.data_models.orchestration.state import AgentOrchestrationState -from bluebox.data_models.routine.endpoint import HTTPMethod -from bluebox.data_models.routine.routine import Routine -from bluebox.data_models.routine_discovery.state import RoutineDiscoveryState, DiscoveryPhase -from bluebox.data_models.routine_discovery.llm_responses import ( - TransactionIdentificationResponse, - Variable, - VariableType, - ExtractedVariableResponse, - ResolvedVariableResponse, - SessionStorageSource, - TransactionSource, - WindowPropertySource, - SessionStorageType, -) -from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader -from bluebox.llms.data_loaders.interactions_data_loader import InteractionsDataLoader -from bluebox.llms.data_loaders.js_data_loader import JSDataLoader -from bluebox.llms.data_loaders.network_data_loader import NetworkDataLoader -from bluebox.llms.data_loaders.storage_data_loader import StorageDataLoader -from bluebox.llms.data_loaders.window_property_data_loader import WindowPropertyDataLoader -from bluebox.utils.data_utils import resolve_dotted_path -from bluebox.utils.logger import get_logger - -logger = get_logger(name=__name__) - - -class RoutineDiscoveryAgentBeta(AbstractAgent): - """ - Orchestrator agent that coordinates specialist subagents for routine discovery. - - Unlike specialists which do focused work, this agent plans and delegates: - - Creates tasks for specialists to handle - - Runs tasks and collects results - - Uses results to construct routines - """ - - AGENT_CARD = AgentCard( - description=( - "Orchestrates routine discovery by coordinating specialist subagents. " - "Delegates network analysis, value tracing, JS generation, and interaction " - "analysis to specialists, then assembles the results into a routine." - ), - ) - - ## System prompts — phase-scoped sections - - # Core identity + delegation rules (included in every phase) - PROMPT_CORE: str = dedent("""\ - You are an expert at analyzing network traffic and building web automation routines. - You coordinate specialist agents to discover and construct routines. - - ## Your Task - Analyze captured browser network data to create a reusable routine that accomplishes the user's task. - - ## CRITICAL: You MUST Delegate to Specialists - - **DO NOT** try to do everything yourself with direct tools. You are an ORCHESTRATOR. 
- Your job is to coordinate specialists, not to manually inspect every transaction. - - **How to delegate:** - 1. `create_task(agent_type="network_specialist", prompt="...")` - 2. `run_pending_tasks()` - 3. `get_task_result(task_id)` to review findings - - ## Important Notes - - Focus on the user's INTENT, not literal wording - - Keep parameters MINIMAL - only what the user MUST provide - - If only one value was observed and it could be hardcoded, hardcode it - - Credentials for fetch operations: same-origin > include > omit - """) - - # Phase-specific instructions (only the active phase's block is included) - PROMPT_PLANNING: str = dedent("""\ - ## Current Phase: PLANNING — Identify the Target Endpoint - - 1. **REQUIRED**: Create a task for network_specialist to find the endpoint: - ``` - create_task( - agent_type="network_specialist", - prompt="Find the API endpoint that accomplishes: . Search for relevant keywords." - ) - ``` - 2. Call `run_pending_tasks()` to execute - 3. Review results with `get_task_result(task_id)` - 4. Use `record_identified_endpoint` with the specialist's findings - """) - - PROMPT_DISCOVERING: str = dedent("""\ - ## Current Phase: DISCOVERING — Process Transactions (BFS Queue) - - For each transaction in the queue: - 1. Use `get_transaction` to see full details - 2. Use `record_extracted_variable` to log variables found in the request - 3. **For DYNAMIC_TOKENs — DELEGATE TO value_trace_resolver**: - ``` - create_task( - agent_type="value_trace_resolver", - prompt="Trace the origin of value '' (variable: ). Find where it comes from." - ) - ``` - 4. Call `run_pending_tasks()` then `get_task_result(task_id)` to get findings - 5. Use `record_resolved_variable` to record where each token comes from - - If source is another transaction, it will be auto-added to the queue - - PREFER NETWORK SOURCES: When a value appears in both session storage AND a prior - transaction response, use source_type='transaction' as the PRIMARY source. - Session storage may be empty in a fresh session. - 6. Use `mark_transaction_processed` when done with a transaction - 7. Continue until queue is empty - - ## Variable Classification Rules - - **PARAMETER** (requires_dynamic_resolution=false): - - Values the user explicitly provides as input - - Examples: search_query, item_id, page_number, username - - Rule: If the user wouldn't directly provide this value, it's NOT a parameter - - **DYNAMIC_TOKEN** (requires_dynamic_resolution=true): - - Auth/session values that change per session - - Examples: CSRF tokens, JWTs, session_id, visitorData, auth headers - - Also: trace IDs, request IDs, correlation IDs - - Rule: If it looks like a generated ID or security token, it's a DYNAMIC_TOKEN - - **STATIC_VALUE** (requires_dynamic_resolution=false): - - Constants that don't change between sessions - - Examples: App version, User-Agent, clientName, timeZone, language codes - - Rule: If you can hardcode it and it will work across sessions, it's STATIC - """) - - PROMPT_CONSTRUCTING: str = dedent("""\ - ## Current Phase: CONSTRUCTING — Build the Routine - - 1. Use `get_discovery_context` to see all processed data (includes CRITICAL_OBSERVED_VALUES) - 2. Review the **Routine Schema Reference** below for required fields and operation types - 3. Use `construct_routine` with the routine definition: - - `routine`: the routine definition (name, description, parameters, operations) - - **If browser is connected (validation available):** - 4. 
After constructing, use `validate_routine` with test_parameters (observed values) - 5. Use `analyze_validation` to reflect on results (REQUIRED before done) - - **If NO browser connected:** - 4. Call `done` directly after construct_routine - - ## Operation Ordering - - Routines typically start with a `navigate` operation to load the target page before - performing any other operations. This is important because: - - `fetch` operations run in the page's JS context — without navigating first, the - browser has no origin, so requests fail with CORS errors. - - Similar situations apply to `js_evaluate`. - - Click/input/scroll operations need a loaded DOM to interact with. - - Look at the root transaction's URL to determine the base URL to navigate to (usually - the origin, e.g. `https://example.com`). - """) - - PLACEHOLDER_INSTRUCTIONS: str = ( - "## Placeholder Syntax\n" - "ALL placeholders use {{param_name}} — the parameter's `type` field drives type coercion at resolution time.\n\n" - "- PARAMS: {{param_name}} (NO prefix, name matches parameter definition)\n" - "- SOURCES (use dot paths): {{cookie:name}}, {{sessionStorage:path.to.value}}, " - "{{localStorage:key}}, {{windowProperty:obj.key}}\n\n" - "EXAMPLES:\n" - '1. String param: "name": "{{username}}" -> "name": "john" (type=string)\n' - '2. Number param: "count": "{{limit}}" -> "count": 50 (type=integer)\n' - '3. Bool param: "active": "{{is_active}}" -> "active": true (type=boolean)\n' - '4. In URL: "/api/{{user_id}}/data" -> "/api/123/data"\n' - '5. Session storage: "token": "{{sessionStorage:auth.access_token}}"\n' - '6. Cookie: "sid": "{{cookie:session_id}}"\n\n' - "CRITICAL — MATCH TYPES TO THE RAW CDP REQUEST:\n" - 'If the raw CDP request has "adults": "5" (a string), use type=string, NOT type=integer.\n' - "Integer would produce 5 (unquoted) and may break the API. Always match the type observed in the actual request." - ) - - PROMPT_VALIDATING: str = dedent("""\ - ## Current Phase: VALIDATING — Test the Routine - - 1. Review the `validate_routine` execution results - 2. Use `analyze_validation` to reflect: - - `analysis`: What worked and what failed - - `data_matches_task`: Does the returned data accomplish the user's original task? - - `next_action`: "done" | "fix_routine" | "retry_validation" - 3. Based on your analysis: - - If data_matches_task=True and next_action="done": call `done` - - If data_matches_task=False: set next_action="fix_routine", then use construct_routine to fix and re-validate - """) - - ## Magic methods - - def __init__( - self, - emit_message_callable: Callable[[EmittedMessage], None], - network_data_loader: NetworkDataLoader, - task: str, - storage_data_loader: StorageDataLoader | None = None, - window_property_data_loader: WindowPropertyDataLoader | None = None, - js_data_loader: JSDataLoader | None = None, - interaction_data_loader: InteractionsDataLoader | None = None, - documentation_data_loader: DocumentationDataLoader | None = None, - llm_model: LLMModel = OpenAIModel.GPT_5_2, - subagent_llm_model: LLMModel | None = None, - max_iterations: int = 50, - remote_debugging_address: str | None = None, - persist_chat_callable: Callable[[Chat], Chat] | None = None, - persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, - stream_chunk_callable: Callable[[str], None] | None = None, - chat_thread: ChatThread | None = None, - existing_chats: list[Chat] | None = None, - ) -> None: - """ - Initialize the RoutineDiscoveryAgentBeta. 
- - Args: - emit_message_callable: Callback to emit messages to the host. - network_data_loader: NetworkDataLoader with network traffic data. - task: The discovery task description. - storage_data_loader: Optional StorageDataLoader for browser storage. - window_property_data_loader: Optional WindowPropertyDataLoader for window properties. - js_data_loader: Optional JSDataLoader for JavaScript files. - interaction_data_loader: Optional InteractionsDataLoader for interaction events. - documentation_data_loader: Optional DocumentationDataLoader for docs and code files. - llm_model: LLM model for the orchestrator. - subagent_llm_model: LLM model for subagents (defaults to orchestrator's model). - max_iterations: Maximum iterations for the main loop. - remote_debugging_address: Chrome remote debugging address for validation. - persist_chat_callable: Optional callback to persist Chat objects. - persist_chat_thread_callable: Optional callback to persist ChatThread. - stream_chunk_callable: Optional callback for streaming text chunks. - chat_thread: Existing ChatThread to continue, or None for new. - existing_chats: Existing Chat messages if loading from persistence. - """ - self._network_data_loader = network_data_loader - self._storage_data_loader = storage_data_loader - self._window_property_data_loader = window_property_data_loader - self._js_data_loader = js_data_loader - self._interaction_data_loader = interaction_data_loader - self._documentation_data_loader = documentation_data_loader - self._task = task - self._subagent_llm_model = subagent_llm_model or llm_model - self._max_iterations = max_iterations - self._remote_debugging_address = remote_debugging_address - - # Internal state - self._orchestration_state = AgentOrchestrationState() - self._discovery_state = RoutineDiscoveryState(phase=DiscoveryPhase.PLANNING) - self._agent_instances: dict[str, AbstractSpecialist] = {} # agent_id -> instance - - # Result tracking - self._final_routine: Routine | None = None - self._failure_reason: str | None = None - - super().__init__( - emit_message_callable=emit_message_callable, - persist_chat_callable=persist_chat_callable, - persist_chat_thread_callable=persist_chat_thread_callable, - stream_chunk_callable=stream_chunk_callable, - llm_model=llm_model, - chat_thread=chat_thread, - existing_chats=existing_chats, - documentation_data_loader=documentation_data_loader, - ) - - ## Abstract method implementations - - def _get_system_prompt(self) -> str: - """Build the system prompt scoped to the current phase.""" - phase = self._discovery_state.phase - - # Core identity + delegation rules (always included) - prompt_parts = [self.PROMPT_CORE] - - # Inject specialist descriptions from AgentCard metadata - specialist_lines = [ - f"- `{agent_type.value}`: {cls.AGENT_CARD.description}" - for agent_type, cls in ( - (SpecialistAgentType.NETWORK_SPECIALIST, NetworkSpecialist), - (SpecialistAgentType.VALUE_TRACE_RESOLVER, ValueTraceResolverSpecialist), - (SpecialistAgentType.JS_SPECIALIST, JSSpecialist), - (SpecialistAgentType.INTERACTION_SPECIALIST, InteractionSpecialist), - ) - ] - prompt_parts.append("\n\n**Available specialists:**\n" + "\n".join(specialist_lines)) - - # Phase-specific instructions - if phase == DiscoveryPhase.PLANNING: - prompt_parts.append(self.PROMPT_PLANNING) - elif phase == DiscoveryPhase.DISCOVERING: - prompt_parts.append(self.PROMPT_DISCOVERING) - elif phase == DiscoveryPhase.CONSTRUCTING: - prompt_parts.append(self.PROMPT_CONSTRUCTING) - 
prompt_parts.append(self.PLACEHOLDER_INSTRUCTIONS) - prompt_parts.append(Routine.model_schema_markdown()) - elif phase == DiscoveryPhase.VALIDATING: - prompt_parts.append(self.PROMPT_VALIDATING) - prompt_parts.append(self.PLACEHOLDER_INSTRUCTIONS) # needed if fix_routine - prompt_parts.append(Routine.model_schema_markdown()) # needed if fix_routine - - # Add data store summaries - data_loader_info = [] - if self._network_data_loader: - stats = self._network_data_loader.stats - data_loader_info.append(f"Network: {stats.total_requests} transactions") - if self._storage_data_loader: - stats = self._storage_data_loader.stats - data_loader_info.append(f"Storage: {stats.total_events} events") - if self._window_property_data_loader: - stats = self._window_property_data_loader.stats - data_loader_info.append(f"Window: {stats.total_events} events") - if self._js_data_loader: - data_loader_info.append("JS files: available") - if self._documentation_data_loader: - summary = self._documentation_data_loader.stats.to_summary() - data_loader_info.append(f"Documentation: {summary}") - - if data_loader_info: - prompt_parts.append(f"\n\n## Data Sources\n{', '.join(data_loader_info)}") - - # Add current state - status = self._orchestration_state.get_queue_status() - prompt_parts.append(dedent(f"""\ - - ## Current State - - Phase: {self._discovery_state.phase.value} - - Pending tasks: {status['pending_tasks']} - - In-progress tasks: {status['in_progress_tasks']} - - Completed tasks: {status['completed_tasks']} - - Failed tasks: {status['failed_tasks']} - """)) - - # Add discovery state tracking info - discovery_status = self._discovery_state.get_queue_status() - if self._discovery_state.root_transaction or self._discovery_state.processed_transactions: - prompt_parts.append(dedent(f"""\ - - ## Discovery Progress - - Root transaction: {"Set" if self._discovery_state.root_transaction else "Not set"} - - Transaction queue: {discovery_status['pending_count']} pending, {discovery_status['processed_count']} processed - - Resolved variables: {len(self._discovery_state.all_resolved_variables)} - - Routine: {"Constructed" if self._discovery_state.production_routine else "Not constructed"} - """)) - - if self._remote_debugging_address: - prompt_parts.append("\n- Browser: Connected (validation available)") - else: - prompt_parts.append("\n- Browser: Not connected (skip validation)") - - return "".join(prompt_parts) - - ## Public API - - def run(self) -> Routine | None: - """ - Run the discovery to completion. - - Returns: - The discovered Routine, or None if discovery failed. - """ - # Seed the conversation with emphasis on delegation - initial_message = ( - f"TASK: {self._task}\n\n" - "IMPORTANT: Start by delegating to network_specialist to find the relevant endpoint. " - "Call create_task(agent_type='network_specialist', prompt='Find the API endpoint for: ') " - "then run_pending_tasks(). DO NOT manually browse transactions yourself." 
- ) - self._add_chat(ChatRole.USER, initial_message) - - # Run the main loop - for iteration in range(self._max_iterations): - logger.debug("RoutineDiscoveryAgentBeta iteration %d/%d, phase: %s", - iteration + 1, self._max_iterations, self._discovery_state.phase.value) - - # Check for completion - if self._discovery_state.phase == DiscoveryPhase.COMPLETE: - return self._final_routine - - if self._discovery_state.phase == DiscoveryPhase.FAILED: - logger.error("Discovery failed: %s", self._failure_reason) - return None - - # Run agent loop iteration - messages = self._build_messages_for_llm() - try: - response = self._call_llm( - messages, - self._get_system_prompt(), - tool_choice="required", - ) - - if response.response_id: - self._previous_response_id = response.response_id - - if response.content or response.tool_calls: - chat = self._add_chat( - ChatRole.ASSISTANT, - response.content or "", - tool_calls=response.tool_calls if response.tool_calls else None, - llm_provider_response_id=response.response_id, - ) - if response.content: - self._emit_message( - ChatResponseEmittedMessage( - content=response.content, - chat_id=chat.id, - chat_thread_id=self._thread.id, - ) - ) - - if response.tool_calls: - self._process_tool_calls(response.tool_calls) - else: - # Prompt the agent to continue if no tool calls - provide phase-specific guidance - phase = self._discovery_state.phase - if phase == DiscoveryPhase.PLANNING: - guidance = ( - "Phase: PLANNING. You MUST delegate to specialists! " - "Call create_task(agent_type='network_specialist', prompt='Find the API endpoint for: ') " - "then run_pending_tasks(). DO NOT use list_transactions or get_transaction directly." - ) - elif phase == DiscoveryPhase.DISCOVERING: - task_status = self._orchestration_state.get_queue_status() - if task_status["pending_tasks"] > 0: - guidance = ( - f"Phase: DISCOVERING. You have {task_status['pending_tasks']} pending tasks. " - "Call run_pending_tasks() to execute them." - ) - elif task_status["completed_tasks"] > 0: - guidance = ( - "Phase: DISCOVERING. Tasks completed. Review results with get_task_result(task_id), " - "then record findings using record_identified_endpoint, record_extracted_variable. " - "For DYNAMIC_TOKENs, delegate to value_trace_resolver - don't use scan_for_value directly." - ) - else: - guidance = ( - "Phase: DISCOVERING. No tasks created yet! You MUST delegate: " - "create_task(agent_type='network_specialist', prompt='...') then run_pending_tasks(). " - "DO NOT manually inspect transactions - let specialists do the work." - ) - elif phase == DiscoveryPhase.CONSTRUCTING: - if not self._discovery_state.production_routine: - guidance = ( - "Phase: CONSTRUCTING. Call get_discovery_context to see all discovered data, " - "then use construct_routine to build the routine." - ) - else: - guidance = ( - "Phase: CONSTRUCTING. Routine already constructed. " - "Proceed to validation or mark as done." - ) - elif phase == DiscoveryPhase.VALIDATING: - guidance = ( - "Phase: VALIDATING. Review the construct_routine execution results. " - "If execution_success=True, call done. If execution_success=False, " - "fix the issues and call construct_routine again." - ) - else: - guidance = f"Phase: {phase.value}. Use tools to make progress."
- - self._add_chat(ChatRole.SYSTEM, f"[ACTION REQUIRED] {guidance}") - - except Exception as e: - logger.exception("Error in RoutineDiscoveryAgentBeta loop: %s", e) - self._emit_message(ErrorEmittedMessage(error=str(e))) - self._discovery_state.phase = DiscoveryPhase.FAILED - self._failure_reason = str(e) - return None - - logger.warning("RoutineDiscoveryAgentBeta hit max iterations (%d)", self._max_iterations) - self._discovery_state.phase = DiscoveryPhase.FAILED - self._failure_reason = f"Max iterations ({self._max_iterations}) reached" - return None - - ## Internal methods - - def _get_or_create_agent(self, task: Task) -> AbstractSpecialist: - """Get existing agent instance or create new one for the task.""" - # Check if task specifies an existing agent - if task.agent_id and task.agent_id in self._agent_instances: - return self._agent_instances[task.agent_id] - - # Create new agent based on type - agent_type = task.agent_type - agent = self._create_specialist(agent_type) - - # Create SubAgent record and store instance - subagent = SubAgent( - type=agent_type, - llm_model=self._subagent_llm_model.value, - ) - self._orchestration_state.subagents[subagent.id] = subagent - self._agent_instances[subagent.id] = agent - - # Update task with agent_id - task.agent_id = subagent.id - subagent.task_ids.append(task.id) - - return agent - - def _create_specialist(self, agent_type: SpecialistAgentType) -> AbstractSpecialist: - """Create a specialist instance based on type.""" - if agent_type == SpecialistAgentType.JS_SPECIALIST: - return JSSpecialist( - emit_message_callable=self._emit_message_callable, - llm_model=self._subagent_llm_model, - documentation_data_loader=self._documentation_data_loader, - network_data_loader=self._network_data_loader, - js_data_loader=None, # NOTE: this is intentionally left None for now - remote_debugging_address=self._remote_debugging_address, - run_mode=RunMode.AUTONOMOUS, - ) - - elif agent_type == SpecialistAgentType.VALUE_TRACE_RESOLVER: - return ValueTraceResolverSpecialist( - emit_message_callable=self._emit_message_callable, - documentation_data_loader=self._documentation_data_loader, - network_data_loader=self._network_data_loader, - storage_data_loader=self._storage_data_loader, - window_property_data_loader=self._window_property_data_loader, - llm_model=self._subagent_llm_model, - run_mode=RunMode.AUTONOMOUS, - ) - - elif agent_type == SpecialistAgentType.NETWORK_SPECIALIST: - if not self._network_data_loader: - raise ValueError( - "network_specialist requires network_data_loader, " - "but it was not provided to RoutineDiscoveryAgentBeta" - ) - return NetworkSpecialist( - emit_message_callable=self._emit_message_callable, - llm_model=self._subagent_llm_model, - network_data_loader=self._network_data_loader, - documentation_data_loader=self._documentation_data_loader, - run_mode=RunMode.AUTONOMOUS, - ) - - elif agent_type == SpecialistAgentType.INTERACTION_SPECIALIST: - if not self._interaction_data_loader: - raise ValueError( - "interaction_specialist requires interaction_data_loader, " - "but it was not provided to RoutineDiscoveryAgentBeta" - ) - return InteractionSpecialist( - emit_message_callable=self._emit_message_callable, - interaction_data_loader=self._interaction_data_loader, - documentation_data_loader=self._documentation_data_loader, - llm_model=self._subagent_llm_model, - run_mode=RunMode.AUTONOMOUS, - ) - - else: - raise NotImplementedError( - f"Agent type {agent_type.value} is not yet supported. 
" - f"Available types: js_specialist, network_specialist, value_trace_resolver, interaction_specialist" - ) - - def _execute_task(self, task: Task) -> dict[str, Any]: - """Execute a task using the appropriate specialist.""" - task.status = TaskStatus.IN_PROGRESS - task.started_at = datetime.now() - - try: - agent = self._get_or_create_agent(task) - - # Calculate remaining loops - remaining_loops = task.max_loops - task.loops_used - if remaining_loops <= 0: - task.status = TaskStatus.FAILED - task.error = "No loops remaining" - return {"success": False, "error": "No loops remaining"} - - # Run autonomous with config - pass output schema here (not before) - # so it doesn't get cleared by _reset_autonomous_state() - config = AutonomousConfig( - min_iterations=1, # Allow immediate finalization for resumed tasks - max_iterations=remaining_loops, - ) - - result = agent.run_autonomous( - task=task.prompt, - config=config, - output_schema=task.output_schema, - output_description=task.output_description, - ) - - # Update loops used - task.loops_used += agent.autonomous_iteration - - if result is not None: - task.status = TaskStatus.COMPLETED - task.completed_at = datetime.now() - task.result = result.model_dump() if isinstance(result, BaseModel) else result - return {"success": True, "result": task.result} - else: - # Agent hit max iterations without finalizing - if task.loops_used < task.max_loops: - task.status = TaskStatus.PAUSED - return {"success": False, "status": "paused", "loops_used": task.loops_used} - else: - task.status = TaskStatus.FAILED - task.error = "Max loops reached without result" - return {"success": False, "error": task.error} - - except Exception as e: - task.status = TaskStatus.FAILED - task.error = str(e) - task.completed_at = datetime.now() - logger.error("Task %s failed: %s", task.id, e) - return {"success": False, "error": str(e)} - - def _validate_discovery_completeness(self) -> tuple[bool, list[str]]: - """ - Check if discovery state is complete enough to construct routine. - - Returns: - Tuple of (is_complete, list_of_blockers). - If is_complete is False, blockers explain what's missing. - """ - blockers = [] - - # Check if root transaction is set - if not self._discovery_state.root_transaction: - blockers.append("No root transaction recorded") - - # Check for unresolved dynamic tokens - unresolved_tokens = [] - for tx_id, tx_data in self._discovery_state.transaction_data.items(): - if tx_data.get("extracted_variables"): - extracted = tx_data["extracted_variables"] - resolved_names = { - rv.variable.name - for rv in tx_data.get("resolved_variables", []) - } - for var in extracted.variables: - if var.requires_dynamic_resolution and var.name not in resolved_names: - unresolved_tokens.append(f"{var.name} (in {tx_id})") - - if unresolved_tokens: - blockers.append(f"Unresolved dynamic tokens: {', '.join(unresolved_tokens)}") - - # Check if transaction queue is not empty - if self._discovery_state.transaction_queue: - blockers.append( - f"Transaction dependencies pending: {self._discovery_state.transaction_queue}" - ) - - is_complete = len(blockers) == 0 - return is_complete, blockers - - def _get_discovery_summary(self) -> str: - """ - Get a human-readable summary of the current discovery state. - - Returns: - Formatted string summarizing discovery progress. 
- """ - lines = [] - lines.append("=== Discovery State Summary ===") - - # Root transaction - if self._discovery_state.root_transaction: - root = self._discovery_state.root_transaction - lines.append(f"Root Transaction: {root.url} ({root.method.value})") - else: - lines.append("Root Transaction: Not set") - - # Transaction processing - status = self._discovery_state.get_queue_status() - lines.append( - f"Transactions: {status['processed_count']} processed, " - f"{status['pending_count']} pending" - ) - - # Variables - params = [ - rv.variable for rv in self._discovery_state.all_resolved_variables - if rv.variable.type == VariableType.PARAMETER - ] - tokens = [ - rv.variable for rv in self._discovery_state.all_resolved_variables - if rv.variable.type == VariableType.DYNAMIC_TOKEN - ] - statics = [ - rv.variable for rv in self._discovery_state.all_resolved_variables - if rv.variable.type == VariableType.STATIC_VALUE - ] - - lines.append(f"Parameters: {len(params)} ({', '.join(p.name for p in params) if params else 'none'})") - lines.append(f"Dynamic Tokens: {len(tokens)} ({', '.join(t.name for t in tokens) if tokens else 'none'})") - lines.append(f"Static Values: {len(statics)}") - - # Routine status - if self._discovery_state.production_routine: - routine = self._discovery_state.production_routine - lines.append( - f"Routine: Constructed ({len(routine.parameters)} params, " - f"{len(routine.operations)} operations)" - ) - else: - lines.append("Routine: Not constructed") - - # Completeness check - is_complete, blockers = self._validate_discovery_completeness() - if is_complete: - lines.append("Status: Ready to construct routine") - else: - lines.append(f"Status: Not ready - {'; '.join(blockers)}") - - return "\n".join(lines) - - ## Tools - Task Management - - # Available agent types for task creation - AVAILABLE_AGENT_TYPES = { - SpecialistAgentType.JS_SPECIALIST, - SpecialistAgentType.NETWORK_SPECIALIST, - SpecialistAgentType.VALUE_TRACE_RESOLVER, - SpecialistAgentType.INTERACTION_SPECIALIST, - } - - @agent_tool( - description="Create a new task for a specialist subagent (network_specialist, value_trace_resolver, js_specialist, interaction_specialist).", - parameters={ - "type": "object", - "properties": { - "agent_type": { - "type": "string", - "enum": ["network_specialist", "value_trace_resolver", "js_specialist", "interaction_specialist"], - "description": "Type of specialist agent" - }, - "prompt": { - "type": "string", - "description": "Task instructions for the specialist" - }, - "agent_id": { - "type": "string", - "description": "Optional ID of existing agent to reuse" - }, - "max_loops": { - "type": "integer", - "default": 15, - "description": "Maximum LLM iterations for this task" - }, - "output_schema": { - "type": "object", - "description": "JSON Schema defining expected output structure" - }, - "output_description": { - "type": "string", - "description": "Human-readable description of expected output" - }, - "context": { - "type": "object", - "description": "Additional context data for the specialist" - } - }, - "required": ["agent_type", "prompt"] - }, - availability=True, - ) - def _create_task( - self, - agent_type: str, - prompt: str, - agent_id: str | None = None, - max_loops: int = 15, - output_schema: dict[str, Any] | None = None, - output_description: str | None = None, - context: dict[str, Any] | None = None, - ) -> dict[str, Any]: - """ - Create a new task for a specialist subagent. 
- - Args: - agent_type: Type of specialist (js_specialist, network_specialist, value_trace_resolver, interaction_specialist). - prompt: Task instructions for the specialist. - agent_id: Optional ID of existing agent to reuse (preserves context). - max_loops: Maximum LLM iterations for this task (default 15). - output_schema: JSON Schema defining expected output structure. - output_description: Human-readable description of expected output. - context: Additional context data for the specialist. - """ - try: - parsed_type = SpecialistAgentType(agent_type) - except ValueError: - valid_types = [t.value for t in self.AVAILABLE_AGENT_TYPES] - return {"error": f"Invalid agent_type. Must be one of: {valid_types}"} - - if parsed_type not in self.AVAILABLE_AGENT_TYPES: - valid_types = [t.value for t in self.AVAILABLE_AGENT_TYPES] - return {"error": f"Agent type '{agent_type}' not available. Use: {valid_types}"} - - task = Task( - agent_type=parsed_type, - agent_id=agent_id, - prompt=prompt, - max_loops=max_loops, - output_schema=output_schema, - output_description=output_description, - context=context or {}, - ) - - self._orchestration_state.add_task(task) - self._discovery_state.phase = DiscoveryPhase.DISCOVERING - - result: dict[str, Any] = { - "success": True, - "task_id": task.id, - "agent_type": agent_type, - "message": "Task created. Use run_pending_tasks to execute.", - } - if output_schema: - result["output_schema_set"] = True - if output_description: - result["output_description_set"] = True - - return result - - @agent_tool( - description="List all tasks and their current status.", - parameters={"type": "object", "properties": {}, "required": []}, - availability=True, - ) - def _list_tasks(self) -> dict[str, Any]: - """List all tasks and their current status.""" - tasks_summary = [] - for task in self._orchestration_state.tasks.values(): - tasks_summary.append({ - "id": task.id, - "agent_type": task.agent_type, - "status": task.status.value, - "prompt": task.prompt[:100] + "..." if len(task.prompt) > 100 else task.prompt, - "loops_used": task.loops_used, - "max_loops": task.max_loops, - }) - - return { - "total": len(tasks_summary), - "pending": len(self._orchestration_state.get_pending_tasks()), - "in_progress": len(self._orchestration_state.get_in_progress_tasks()), - "completed": len(self._orchestration_state.get_completed_tasks()), - "failed": len(self._orchestration_state.get_failed_tasks()), - "tasks": tasks_summary, - } - - @agent_tool( - description="Get the result of a completed task.", - parameters={ - "type": "object", - "properties": { - "task_id": { - "type": "string", - "description": "The ID of the task to get results for" - } - }, - "required": ["task_id"] - }, - availability=True, - ) - def _get_task_result(self, task_id: str) -> dict[str, Any]: - """ - Get the result of a completed task. - - Args: - task_id: The ID of the task to get results for. 
- """ - task = self._orchestration_state.tasks.get(task_id) - if not task: - return {"error": f"Task {task_id} not found"} - - return { - "task_id": task.id, - "status": task.status.value, - "result": task.result, - "error": task.error, - "loops_used": task.loops_used, - } - - @agent_tool( - description="Execute all pending tasks and return their results.", - parameters={"type": "object", "properties": {}, "required": []}, - availability=True, - ) - def _run_pending_tasks(self) -> dict[str, Any]: - """Execute all pending tasks concurrently and return their results.""" - pending = self._orchestration_state.get_pending_tasks() - if not pending: - return {"message": "No pending tasks", "results": []} - - if len(pending) == 1: - # Single task — no threading overhead - task = pending[0] - result = self._execute_task(task) - results = [{"task_id": task.id, "agent_type": task.agent_type, **result}] - else: - # Multiple independent tasks — run in parallel - results = [] - with ThreadPoolExecutor(max_workers=len(pending)) as executor: - future_to_task = { - executor.submit(self._execute_task, task): task - for task in pending - } - for future in as_completed(future_to_task): - task = future_to_task[future] - try: - result = future.result() - except Exception as e: - logger.error("Task %s raised exception: %s", task.id, e) - result = {"success": False, "error": str(e)} - results.append({ - "task_id": task.id, - "agent_type": task.agent_type, - **result, - }) - # Preserve original task order for deterministic output - task_order = {task.id: i for i, task in enumerate(pending)} - results.sort( - key=lambda r: task_order.get(r["task_id"], 0) - ) - - # Check if all tasks are done and update phase - phase_message = None - if not self._orchestration_state.get_pending_tasks() and not self._orchestration_state.get_in_progress_tasks(): - if self._orchestration_state.get_failed_tasks(): - phase_message = "Some tasks failed. Review results and decide next steps." - else: - # All tasks completed successfully - # Check if we can transition to CONSTRUCTING - can_construct = True - construction_blockers = [] - - # Check if root transaction is set - if not self._discovery_state.root_transaction: - construction_blockers.append("No root transaction recorded (use record_identified_endpoint)") - - # Check if any unresolved dynamic tokens exist - unresolved_tokens = [] - for tx_id, tx_data in self._discovery_state.transaction_data.items(): - if tx_data.get("extracted_variables"): - extracted = tx_data["extracted_variables"] - resolved_names = { - rv.variable.name - for rv in tx_data.get("resolved_variables", []) - } - for var in extracted.variables: - if var.requires_dynamic_resolution and var.name not in resolved_names: - unresolved_tokens.append(var.name) - - if unresolved_tokens: - construction_blockers.append( - f"Unresolved dynamic tokens: {unresolved_tokens} " - f"(use value_trace_resolver and record_resolved_variable)" - ) - - # Check if transaction queue is not empty (dependencies pending) - if self._discovery_state.transaction_queue: - construction_blockers.append( - f"Transaction queue not empty: {self._discovery_state.transaction_queue} " - f"(process dependencies first)" - ) - - if construction_blockers: - can_construct = False - phase_message = ( - "All tasks completed, but cannot construct routine yet. 
Blockers: " + - "; ".join(construction_blockers) - ) - else: - # Can transition to CONSTRUCTING - self._discovery_state.phase = DiscoveryPhase.CONSTRUCTING - phase_message = ( - "All tasks completed and discovery is complete! " - "Use get_discovery_context to see all discovered data, " - "then construct_routine to build the routine." - ) - - result = { - "executed": len(results), - "results": results, - "phase": self._discovery_state.phase.value, - } - - if phase_message: - result["phase_message"] = phase_message - - return result - - ## Tools - Data Access - - @agent_tool( - description="[PREFER network_specialist] List transaction IDs. For finding the RIGHT endpoint, delegate to network_specialist instead - it can search semantically.", - parameters={"type": "object", "properties": {}, "required": []}, - availability=lambda self: self._network_data_loader is not None, - ) - def _list_transactions(self) -> dict[str, Any]: - """List all available transaction IDs from the network captures.""" - if not self._network_data_loader: - return {"error": "No network data store available"} - - entries = self._network_data_loader.entries - # Filter to likely-useful API entries (skip static assets) - static_extensions = ('.js', '.css', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf') - api_entries = [e for e in entries if not any(e.url.split('?')[0].endswith(ext) for ext in static_extensions)] - tx_summaries = [ - {"id": e.request_id, "method": e.method, "url": e.url[:100]} - for e in api_entries - ] - return { - "transactions": tx_summaries, - "count": len(entries), - "showing": len(tx_summaries), - "filtered_out": len(entries) - len(api_entries), - } - - @agent_tool( - description="Get full details of a transaction. Use AFTER network_specialist identifies the right transaction ID.", - parameters={ - "type": "object", - "properties": { - "transaction_id": { - "type": "string", - "description": "The ID of the transaction to retrieve" - } - }, - "required": ["transaction_id"] - }, - availability=lambda self: self._network_data_loader is not None, - ) - def _get_transaction(self, transaction_id: str) -> dict[str, Any]: - """ - Get full details of a transaction. - - Args: - transaction_id: The ID of the transaction to retrieve. - """ - if not self._network_data_loader: - return {"error": "No network data store available"} - - entry = self._network_data_loader.get_entry(transaction_id) - if not entry: - # Show some available IDs as hints - available = [e.request_id for e in self._network_data_loader.entries[:10]] - return {"error": f"Transaction {transaction_id} not found. Sample IDs: {available}"} - - max_body_len = 5_000 - response_body = entry.response_body - truncated = False - original_length = 0 - if response_body: - original_length = len(response_body) - if original_length > max_body_len: - response_body = response_body[:max_body_len] - truncated = True - - result: dict[str, Any] = { - "transaction_id": transaction_id, - "method": entry.method, - "url": entry.url, - "status": entry.status, - "request_headers": entry.request_headers, - "post_data": entry.post_data, - "response_headers": entry.response_headers, - "response_body": response_body, - } - if truncated: - result["response_body_truncated"] = True - result["response_body_full_length"] = original_length - result["response_body_note"] = ( - f"Response body truncated to {max_body_len} chars " - f"(full length: {original_length}). " - f"Delegate to network_specialist for full body search." 
- ) - return result - - @agent_tool( - description=( - "[PREFER value_trace_resolver SPECIALIST] Basic value search. " - "For DYNAMIC_TOKENs, delegate to value_trace_resolver instead - it has deeper analysis capabilities." - ), - parameters={ - "type": "object", - "properties": { - "value": { - "type": "string", - "description": "The value to search for" - }, - "exclude_transaction_id": { - "type": "string", - "description": "Transaction ID to exclude from search (usually the one containing the value)" - } - }, - "required": ["value"] - }, - availability=True, - ) - def _scan_for_value( - self, - value: str, - exclude_transaction_id: str | None = None - ) -> dict[str, Any]: - """ - Search for a value across all data sources. - - Args: - value: The value to search for. - exclude_transaction_id: Transaction ID to exclude from search. - """ - results: dict[str, Any] = { - "value": value, - "found_in": [], - } - - # Search network transactions - if self._network_data_loader: - for entry in self._network_data_loader.entries: - if exclude_transaction_id and entry.request_id == exclude_transaction_id: - continue - - # Search response body - if entry.response_body and value in entry.response_body: - results["found_in"].append({ - "source_type": "transaction", - "transaction_id": entry.request_id, - "location": "response_body", - "url": entry.url[:100], - }) - - # Search response headers - if entry.response_headers: - for header_name, header_value in entry.response_headers.items(): - if value in str(header_value): - results["found_in"].append({ - "source_type": "transaction", - "transaction_id": entry.request_id, - "location": f"response_header:{header_name}", - "url": entry.url[:100], - }) - - # Search request headers - if entry.request_headers: - for header_name, header_value in entry.request_headers.items(): - if value in str(header_value): - results["found_in"].append({ - "source_type": "transaction", - "transaction_id": entry.request_id, - "location": f"request_header:{header_name}", - "url": entry.url[:100], - }) - - # Search request body (post_data) - if entry.post_data: - post_data_str = entry.post_data if isinstance(entry.post_data, str) else json.dumps(entry.post_data) - if value in post_data_str: - results["found_in"].append({ - "source_type": "transaction", - "transaction_id": entry.request_id, - "location": "request_body", - "url": entry.url[:100], - }) - - # Search storage - if self._storage_data_loader: - for event in self._storage_data_loader.entries: - if hasattr(event, 'value') and event.value and value in str(event.value): - results["found_in"].append({ - "source_type": "storage", - "storage_type": event.storage_type if hasattr(event, 'storage_type') else "unknown", - "key": event.key if hasattr(event, 'key') else "unknown", - }) - - # Search window properties - if self._window_property_data_loader: - for event in self._window_property_data_loader.entries: - if hasattr(event, 'value') and event.value and value in str(event.value): - results["found_in"].append({ - "source_type": "window_property", - "path": event.path if hasattr(event, 'path') else "unknown", - }) - - results["total_matches"] = len(results["found_in"]) - return results - - ## Tools - State Population - - @agent_tool( - description="Record the main transaction identified (root transaction for routine).", - parameters={ - "type": "object", - "properties": { - "request_id": { - "type": "string", - "description": "The transaction ID (HAR entry ID)" - }, - "url": { - "type": "string", - "description": "The URL of the 
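A stripped-down illustration of the provenance scan that `_scan_for_value` performs over captured transactions, reduced here to two searchable locations and dict-shaped entries that stand in for the real loader types:

```python
def scan_entries(entries: list[dict], value: str, exclude_id: str | None = None) -> list[dict]:
    """Report every capture location where a literal value appears."""
    hits = []
    for entry in entries:
        if entry["id"] == exclude_id:
            continue
        for location in ("response_body", "post_data"):
            if value in (entry.get(location) or ""):
                hits.append({"transaction_id": entry["id"], "location": location})
    return hits

entries = [
    {"id": "r1", "response_body": '{"token": "abc123"}', "post_data": ""},
    {"id": "r2", "response_body": "", "post_data": '{"auth": "abc123"}'},
]
# Excluding the consumer (r2) surfaces the producer (r1):
assert scan_entries(entries, "abc123", exclude_id="r2") == [
    {"transaction_id": "r1", "location": "response_body"}
]
```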
endpoint" - }, - "method": { - "type": "string", - "enum": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], - "description": "HTTP method" - }, - "description": { - "type": "string", - "description": "What this transaction does" - } - }, - "required": ["request_id", "url", "method", "description"] - }, - availability=lambda self: self._network_data_loader is not None, - ) - def _record_identified_endpoint( - self, - request_id: str, - url: str, - method: str, - description: str - ) -> dict[str, Any]: - """ - Record the main transaction identified by network_specialist. - This becomes the root_transaction in discovery state. - - Args: - request_id: The HAR entry ID from network_specialist results. - url: The URL of the endpoint. - method: HTTP method (GET, POST, etc). - description: What this transaction does. - """ - # Validate request_id exists in network data - if not self._network_data_loader: - return {"error": "No network data loader available"} - - entry = self._network_data_loader.get_entry(request_id) - if not entry: - available_ids = [e.request_id for e in self._network_data_loader.entries[:10]] - return { - "error": f"Request ID '{request_id}' not found", - "sample_ids": available_ids - } - - # Parse HTTP method - try: - http_method = HTTPMethod(method.upper()) - except ValueError: - return {"error": f"Invalid HTTP method '{method}'. Use GET, POST, PUT, DELETE, etc."} - - # Create TransactionIdentificationResponse - root_transaction = TransactionIdentificationResponse( - transaction_id=request_id, - description=description, - url=url, - method=http_method, - short_explanation=f"Main endpoint for {description}" - ) - - # Store in discovery state - self._discovery_state.root_transaction = root_transaction - - # Add to transaction queue - added, position = self._discovery_state.add_to_queue(request_id) - - # Initialize transaction data - self._discovery_state.store_transaction_data( - transaction_id=request_id, - request={ - "url": entry.url, - "method": entry.method, - "headers": entry.request_headers, - "body": entry.post_data, - } - ) - - # Transition to DISCOVERING phase - self._discovery_state.phase = DiscoveryPhase.DISCOVERING - - return { - "success": True, - "transaction_id": request_id, - "added_to_queue": added, - "queue_position": position, - "message": f"Recorded root transaction: {url}" - } - - @agent_tool( - description="Record a variable discovered from analyzing a transaction (parameter, dynamic_token, or static_value).", - parameters={ - "type": "object", - "properties": { - "transaction_id": { - "type": "string", - "description": "The transaction this variable belongs to" - }, - "name": { - "type": "string", - "description": "Variable name (e.g., 'origin_city', 'x-trace-id')" - }, - "type": { - "type": "string", - "enum": ["parameter", "dynamic_token", "static_value"], - "description": "Variable type" - }, - "observed_value": { - "type": "string", - "description": "The actual value seen in the capture" - }, - "requires_dynamic_resolution": { - "type": "boolean", - "description": "True if value must be resolved at runtime" - }, - "values_to_scan_for": { - "type": "array", - "items": {"type": "string"}, - "description": "Optional list of values to search for" - } - }, - "required": ["transaction_id", "name", "type", "observed_value", "requires_dynamic_resolution"] - }, - availability=lambda self: self._discovery_state.root_transaction is not None, - ) - def _record_extracted_variable( - self, - transaction_id: str, - name: str, - type: str, - 
observed_value: str, - requires_dynamic_resolution: bool, - values_to_scan_for: list[str] | None = None - ) -> dict[str, Any]: - """ - Record a variable discovered from analyzing a transaction. - - Args: - transaction_id: The transaction this variable belongs to. - name: Variable name (e.g., "origin_city", "x-trace-id"). - type: Variable type - "parameter", "dynamic_token", or "static_value". - observed_value: The actual value seen in the capture. - requires_dynamic_resolution: True if value must be resolved at runtime. - values_to_scan_for: Optional list of values to search for (defaults to [observed_value]). - """ - # Validate variable type - try: - var_type = VariableType(type) - except ValueError: - return { - "error": f"Invalid variable type '{type}'. Use: parameter, dynamic_token, or static_value" - } - - # Create Variable object - variable = Variable( - type=var_type, - requires_dynamic_resolution=requires_dynamic_resolution, - name=name, - observed_value=observed_value, - values_to_scan_for=values_to_scan_for or [observed_value] - ) - - # Check if transaction_data exists for this transaction - if transaction_id not in self._discovery_state.transaction_data: - self._discovery_state.transaction_data[transaction_id] = { - "request": None, - "extracted_variables": None, - "resolved_variables": [] - } - - # Get or create ExtractedVariableResponse - tx_data = self._discovery_state.transaction_data[transaction_id] - if tx_data.get("extracted_variables") is None: - extracted = ExtractedVariableResponse( - transaction_id=transaction_id, - variables=[variable] - ) - tx_data["extracted_variables"] = extracted - else: - # Add to existing variables - tx_data["extracted_variables"].variables.append(variable) - - return { - "success": True, - "transaction_id": transaction_id, - "variable_name": name, - "variable_type": type, - "requires_resolution": requires_dynamic_resolution, - "message": f"Recorded variable '{name}' for transaction {transaction_id}" - } - - @agent_tool( - description="Record how to resolve a dynamic token (storage, window_property, or transaction source). 
Auto-adds dependency transactions.", - parameters={ - "type": "object", - "properties": { - "variable_name": { - "type": "string", - "description": "Name of the variable being resolved" - }, - "transaction_id": { - "type": "string", - "description": "The transaction this variable belongs to" - }, - "source_type": { - "type": "string", - "enum": ["storage", "window_property", "transaction"], - "description": "Where the value comes from" - }, - "storage_source": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["cookie", "localStorage", "sessionStorage"] - }, - "dot_path": {"type": "string"} - }, - "description": "For storage source" - }, - "window_property_source": { - "type": "object", - "properties": { - "dot_path": {"type": "string"} - }, - "description": "For window source" - }, - "transaction_source": { - "type": "object", - "properties": { - "transaction_id": {"type": "string"}, - "dot_path": {"type": "string"} - }, - "description": "For transaction source" - } - }, - "required": ["variable_name", "transaction_id", "source_type"] - }, - availability=lambda self: self._discovery_state.root_transaction is not None, - ) - def _record_resolved_variable( - self, - variable_name: str, - transaction_id: str, - source_type: str, - storage_source: dict[str, str] | None = None, - window_property_source: dict[str, str] | None = None, - transaction_source: dict[str, str] | None = None, - ) -> dict[str, Any]: - """ - Record how to resolve a dynamic token. - - Args: - variable_name: Name of the variable being resolved. - transaction_id: The transaction this variable belongs to. - source_type: Where the value comes from ("storage", "window_property", "transaction"). - storage_source: For storage source - {"type": "cookie|localStorage|sessionStorage", "dot_path": "path"}. - window_property_source: For window property source - {"dot_path": "path"}. - transaction_source: For transaction source - {"transaction_id": "id", "dot_path": "path"}. 
- """ - # Get the variable from extracted variables - tx_data = self._discovery_state.transaction_data.get(transaction_id) - if not tx_data or not tx_data.get("extracted_variables"): - return {"error": f"No extracted variables found for transaction {transaction_id}"} - - extracted = tx_data["extracted_variables"] - variable = None - for var in extracted.variables: - if var.name == variable_name: - variable = var - break - - if not variable: - available = [v.name for v in extracted.variables] - return { - "error": f"Variable '{variable_name}' not found in transaction {transaction_id}", - "available_variables": available - } - - # Build the source object based on source_type - source = None - dependency_added = False - - if source_type == "storage": - if not storage_source: - return {"error": "storage_source required for source_type='storage'"} - try: - storage_type = SessionStorageType(storage_source["type"]) - except (KeyError, ValueError): - return {"error": "storage_source must have 'type' (cookie, localStorage, sessionStorage) and 'dot_path'"} - source = SessionStorageSource( - type=storage_type, - dot_path=storage_source.get("dot_path", "") - ) - - elif source_type == "window_property": - if not window_property_source: - return {"error": "window_property_source required for source_type='window_property'"} - source = WindowPropertySource( - dot_path=window_property_source.get("dot_path", "") - ) - - elif source_type == "transaction": - if not transaction_source: - return {"error": "transaction_source required for source_type='transaction'"} - source_tx_id = transaction_source.get("transaction_id") - if not source_tx_id: - return {"error": "transaction_source must have 'transaction_id' and 'dot_path'"} - - dot_path = transaction_source.get("dot_path", "") - - # Validate that dot_path resolves in the source transaction's response - if dot_path and self._network_data_loader: - source_entry = self._network_data_loader.get_entry(source_tx_id) - if source_entry and source_entry.response_body: - resolved_value = resolve_dotted_path(logger, source_entry.response_body, dot_path) - if resolved_value is None: - return { - "error": ( - f"dot_path '{dot_path}' does not resolve to a value in transaction {source_tx_id}'s " - "response body. Verify the path is correct." - ) - } - - source = TransactionSource( - transaction_id=source_tx_id, - dot_path=dot_path, - ) - - # Auto-add dependency transaction to queue - added, position = self._discovery_state.add_to_queue(source_tx_id) - if added: - dependency_added = True - # Initialize transaction data for dependency if not exists - if source_tx_id not in self._discovery_state.transaction_data: - entry = self._network_data_loader.get_entry(source_tx_id) if self._network_data_loader else None - if entry: - self._discovery_state.store_transaction_data( - transaction_id=source_tx_id, - request={ - "url": entry.url, - "method": entry.method, - "headers": entry.request_headers, - "body": entry.post_data, - } - ) - - else: - return {"error": f"Invalid source_type '{source_type}'. 
Use: storage, window_property, transaction"} - - # Create ResolvedVariableResponse - resolved = ResolvedVariableResponse( - variable=variable, - source=source - ) - - # Store in transaction data - if "resolved_variables" not in tx_data: - tx_data["resolved_variables"] = [] - tx_data["resolved_variables"].append(resolved) - - result = { - "success": True, - "variable_name": variable_name, - "source_type": source_type, - "message": f"Recorded resolution for '{variable_name}'" - } - - if dependency_added: - result["dependency_added"] = source_tx_id - result["message"] += f" (dependency transaction {source_tx_id} added to queue)" - - return result - - @agent_tool( - description="Mark a transaction as fully processed (all variables extracted and resolved). Removes from queue.", - parameters={ - "type": "object", - "properties": { - "transaction_id": { - "type": "string", - "description": "The transaction ID to mark as processed" - } - }, - "required": ["transaction_id"] - }, - availability=lambda self: self._discovery_state.root_transaction is not None, - ) - def _mark_transaction_processed(self, transaction_id: str) -> dict[str, Any]: - """ - Mark a transaction as fully processed. - - Call this when you've extracted all variables and resolved all dynamic tokens - for a transaction. This removes it from the queue and adds it to processed list. - - Args: - transaction_id: The transaction ID to mark as processed. - """ - # Check if transaction exists in our data - if transaction_id not in self._discovery_state.transaction_data: - return {"error": f"Transaction {transaction_id} not found in discovery data"} - - # Check for unresolved dynamic tokens - tx_data = self._discovery_state.transaction_data[transaction_id] - unresolved = [] - if tx_data.get("extracted_variables"): - resolved_names = { - rv.variable.name - for rv in tx_data.get("resolved_variables", []) - } - for var in tx_data["extracted_variables"].variables: - if var.requires_dynamic_resolution and var.name not in resolved_names: - unresolved.append(var.name) - - if unresolved: - return { - "error": f"Cannot mark as processed - unresolved dynamic tokens: {unresolved}", - "hint": "Use scan_for_value and record_resolved_variable for each token first" - } - - # Remove from queue if present - if transaction_id in self._discovery_state.transaction_queue: - self._discovery_state.transaction_queue.remove(transaction_id) - - # Mark as processed - self._discovery_state.mark_transaction_complete(transaction_id) - - # Get next transaction in queue - queue_status = self._discovery_state.get_queue_status() - - return { - "success": True, - "transaction_id": transaction_id, - "message": f"Transaction {transaction_id} marked as processed", - "remaining_queue": queue_status["pending"], - "processed_count": queue_status["processed_count"], - } - - @agent_tool() - def _get_discovery_context(self) -> dict[str, Any]: - """Get complete discovery context for routine construction.""" - # Build CRITICAL observed values reminder - this goes at the TOP - observed_values_for_params: dict[str, str] = {} - for tx_id, tx_data in self._discovery_state.transaction_data.items(): - if tx_data.get("extracted_variables"): - for var in tx_data["extracted_variables"].variables: - if var.type == VariableType.PARAMETER and var.observed_value: - observed_values_for_params[var.name] = var.observed_value - - context: dict[str, Any] = { - "phase": self._discovery_state.phase.value, - "CRITICAL_OBSERVED_VALUES": { - "message": "YOU MUST INCLUDE THESE observed_value FIELDS WHEN 
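To make the resolution tools concrete: a hypothetical `record_resolved_variable` call that resolves a dynamic token from an earlier transaction's response body. Field names follow the tool schema above; the request IDs and dot path are invented:

```python
# Hypothetical arguments: resolve x-csrf-token from a prior response.
record_resolved_variable_args = {
    "variable_name": "x-csrf-token",
    "transaction_id": "request-42",      # transaction that consumes the token
    "source_type": "transaction",
    "transaction_source": {
        "transaction_id": "request-17",  # transaction whose response produces it
        "dot_path": "data.session.csrfToken",
    },
}
# Per the handler above, recording this also enqueues request-17 as a
# dependency so its own variables are extracted before construction.
```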
CONSTRUCTING ROUTINE PARAMETERS!", - "parameters_with_observed_values": observed_values_for_params, - }, - "root_transaction": None, - "processed_transactions": [], - "all_variables": { - "parameters": [], - "dynamic_tokens": [], - "static_values": [], - }, - "resolution_map": {}, - "summary": self._get_discovery_summary(), - } - - # Root transaction - if self._discovery_state.root_transaction: - root = self._discovery_state.root_transaction - context["root_transaction"] = { - "transaction_id": root.transaction_id, - "url": root.url, - "method": root.method.value, - "description": root.description, - } - - # Process all transaction data - for tx_id, tx_data in self._discovery_state.transaction_data.items(): - tx_summary = { - "transaction_id": tx_id, - "request": tx_data.get("request"), - "variables": [], - } - - if tx_data.get("extracted_variables"): - for var in tx_data["extracted_variables"].variables: - var_info = { - "name": var.name, - "type": var.type.value, - "observed_value": var.observed_value, - "requires_resolution": var.requires_dynamic_resolution, - } - tx_summary["variables"].append(var_info) - - # Categorize by type - if var.type == VariableType.PARAMETER: - context["all_variables"]["parameters"].append(var_info) - elif var.type == VariableType.DYNAMIC_TOKEN: - context["all_variables"]["dynamic_tokens"].append(var_info) - else: - context["all_variables"]["static_values"].append(var_info) - - # Add resolution info - if tx_data.get("resolved_variables"): - for resolved in tx_data["resolved_variables"]: - source_info = {} - if isinstance(resolved.source, SessionStorageSource): - source_info = { - "type": "storage", - "storage_type": resolved.source.type.value, - "dot_path": resolved.source.dot_path, - } - elif isinstance(resolved.source, WindowPropertySource): - source_info = { - "type": "window_property", - "dot_path": resolved.source.dot_path, - } - elif isinstance(resolved.source, TransactionSource): - source_info = { - "type": "transaction", - "transaction_id": resolved.source.transaction_id, - "dot_path": resolved.source.dot_path, - } - - context["resolution_map"][resolved.variable.name] = source_info - - context["processed_transactions"].append(tx_summary) - - # Completeness check - is_complete, blockers = self._validate_discovery_completeness() - context["is_complete"] = is_complete - context["blockers"] = blockers - - return context - - ## Tools - Routine Construction - - @agent_tool( - description="Construct a routine from discovered data. After constructing, use validate_routine to test it.", - parameters={ - "type": "object", - "properties": { - "routine": { - "type": "object", - "description": "The routine to construct.", - "properties": { - "name": {"type": "string", "description": "Routine name"}, - "description": {"type": "string", "description": "What the routine does"}, - "parameters": { - "type": "array", - "description": "Input parameters. Each needs: name, type (string|number|boolean|date|enum), description.", - "items": {"type": "object"}, - }, - "operations": { - "type": "array", - "description": ( - "Ordered operations. Each needs a 'type' field: " - "navigate|fetch|return|sleep|click|input_text|press|" - "wait_for_url|scroll|get_cookies|download|return_html|js_evaluate. " - "Key schemas — navigate: {type, url}. " - "fetch: {type, endpoint: {url, method, headers?, body?}, session_storage_key}. " - "return: {type, session_storage_key, tables?}. " - "Use {{paramName}} placeholders in URLs/bodies for parameters." 
- ), - "items": {"type": "object"}, - }, - }, - "required": ["name", "description", "parameters", "operations"], - }, - }, - "required": ["routine"], - }, - availability=lambda self: ( - self._discovery_state.root_transaction is not None and - not self._discovery_state.transaction_queue - ), - ) - def _construct_routine( - self, - routine: dict[str, Any], - ) -> dict[str, Any]: - """ - Construct a routine from discovered data (no execution). - - After constructing, use validate_routine to test it with parameters. - - Args: - routine: The routine dict with name, description, parameters, and operations. - """ - self._discovery_state.phase = DiscoveryPhase.CONSTRUCTING - self._discovery_state.construction_attempts += 1 - - # Reset validation state when routine is (re)constructed - self._discovery_state.last_validation_result = None - self._discovery_state.validation_analyzed = False - self._discovery_state.last_analysis = None - - try: - routine_obj = Routine.model_validate(routine) - except Exception as e: - return { - "error": f"Invalid routine structure: {e}", - "message": "Failed to parse routine. Check schema in the docs and try again.", - } - - # Get structure warnings (errors are already caught by model validation above) - structure_warnings = routine_obj.get_structure_warnings() - - try: - self._discovery_state.production_routine = routine_obj - - return { - "success": True, - "routine_name": routine_obj.name, - "parameter_count": len(routine_obj.parameters), - "operation_count": len(routine_obj.operations), - "warnings": structure_warnings, - "message": "Routine constructed. Now use validate_routine with test_parameters to execute and verify it works.", - } - - except Exception as e: - return { - "error": str(e), - "message": "Failed to construct routine. Check schema in the docs and try again.", - } - - @agent_tool( - description=( - "Execute the constructed routine with test parameters to validate it works. " - "Only available when browser is connected." - ), - parameters={ - "type": "object", - "properties": { - "test_parameters": { - "type": "object", - "description": ( - "Test parameter values from observed data. " - "Map of parameter_name -> observed_value. " - "Example: {\"origin\": \"NYC\", \"destination\": \"BOS\"}. " - "Get these from the extracted variables' observed_value fields." - ), - "additionalProperties": {"type": "string"}, - }, - }, - "required": ["test_parameters"], - }, - availability=lambda self: ( - self._discovery_state.production_routine is not None and - self._remote_debugging_address is not None # Require browser connection - ), - ) - def _validate_routine( - self, - test_parameters: dict[str, str], - ) -> dict[str, Any]: - """ - Execute the constructed routine with test parameters to validate it works. - - After validation, use analyze_validation to reflect on results before calling done. - - Args: - test_parameters: Map of parameter names to observed values for testing. - """ - if not self._discovery_state.production_routine: - return {"error": "No routine constructed. 
Use construct_routine first."} - - self._discovery_state.phase = DiscoveryPhase.VALIDATING - self._discovery_state.validation_attempts += 1 - - # Store test_parameters in discovery state - self._discovery_state.test_parameters = test_parameters - - # Reset analysis state - self._discovery_state.validation_analyzed = False - self._discovery_state.last_analysis = None - - routine_obj = self._discovery_state.production_routine - - # Import here to avoid circular dependency - from bluebox.llms.tools.execute_routine_tool import execute_routine - - result = execute_routine( - routine=routine_obj.model_dump(), - parameters=test_parameters, - remote_debugging_address=self._remote_debugging_address, - timeout=60, - close_tab_when_done=True, - ) - - # Store full result for analysis - if result.get("success"): - exec_result = result.get("result") - self._discovery_state.last_validation_result = { - "success": True, - "exec_result": exec_result.model_dump() if exec_result else None, - "data_returned": exec_result.data is not None if exec_result else False, - } - - if exec_result and exec_result.ok and exec_result.data is not None: - return { - "routine_name": routine_obj.name, - "execution_success": True, - "data_returned": True, - "data_preview": str(exec_result.data)[:500], - "message": "Routine executed successfully with data. Use analyze_validation to reflect on results.", - } - else: - return { - "routine_name": routine_obj.name, - "execution_success": True, - "data_returned": False, - "exec_result": exec_result.model_dump() if exec_result else None, - "message": ( - "Routine executed but 'data' field is missing or empty. " - "Use analyze_validation to decide next steps." - ), - } - else: - self._discovery_state.last_validation_result = { - "success": False, - "error": result.get("error", "Unknown error"), - } - return { - "routine_name": routine_obj.name, - "execution_success": False, - "error": result.get("error", "Unknown error"), - "message": "Routine execution failed. Use analyze_validation to decide next steps.", - } - - @agent_tool( - description="Analyze validation results and decide next steps. REQUIRED before calling done().", - parameters={ - "type": "object", - "properties": { - "analysis": { - "type": "string", - "description": "Your analysis of what worked and what failed in the validation.", - }, - "data_matches_task": { - "type": "boolean", - "description": "Does the returned data accomplish the original task the user requested?", - }, - "next_action": { - "type": "string", - "enum": ["done", "fix_routine", "retry_validation"], - "description": ( - "What to do next: 'done' if successful, 'fix_routine' to modify routine, " - "'retry_validation' to re-run." - ), - }, - }, - "required": ["analysis", "data_matches_task", "next_action"], - }, - availability=lambda self: ( - self._discovery_state.last_validation_result is not None and - not self._discovery_state.validation_analyzed - ), - ) - def _analyze_validation( - self, - analysis: str, - data_matches_task: bool, - next_action: str, - ) -> dict[str, Any]: - """ - Analyze validation results and decide next steps. Required before calling done(). - - Args: - analysis: Your analysis of what worked and what failed. - data_matches_task: Does the returned data accomplish the original task? - next_action: What to do next - 'done', 'fix_routine', or 'retry_validation'. - """ - if self._discovery_state.last_validation_result is None: - return {"error": "No validation result to analyze. 
Use validate_routine first."} - - # Validate next_action - valid_actions = ["done", "fix_routine", "retry_validation"] - if next_action not in valid_actions: - return {"error": f"Invalid next_action. Must be one of: {valid_actions}"} - - # Store the analysis - self._discovery_state.last_analysis = { - "analysis": analysis, - "data_matches_task": data_matches_task, - "next_action": next_action, - } - self._discovery_state.validation_analyzed = True - - # Check for inconsistency: can't say "done" if data doesn't match task - if next_action == "done" and not data_matches_task: - return { - "error": "Inconsistent analysis: next_action is 'done' but data_matches_task is False.", - "message": "If data doesn't match the task, you must fix the routine first.", - "hint": "Set next_action to 'fix_routine' and update the routine to return correct data.", - } - - # Check validation result - validation_result = self._discovery_state.last_validation_result - validation_failed = not validation_result.get("success", False) and not validation_result.get("skipped", False) - - if next_action == "done" and validation_failed: - return { - "error": "Cannot mark as done when validation failed.", - "message": "Fix the routine and re-validate before completing.", - } - - # Return guidance based on next_action - if next_action == "done": - return { - "success": True, - "message": "Analysis recorded. You may now call done() to complete discovery.", - "analysis_summary": { - "analysis": analysis, - "data_matches_task": data_matches_task, - }, - } - elif next_action == "fix_routine": - return { - "success": True, - "message": "Analysis recorded. Use construct_routine to fix the routine, then validate_routine again.", - "analysis_summary": { - "analysis": analysis, - "data_matches_task": data_matches_task, - }, - } - else: # retry_validation - # Reset for retry - self._discovery_state.validation_analyzed = False - self._discovery_state.last_analysis = None - return { - "success": True, - "message": "Analysis recorded. Use validate_routine to retry validation.", - "analysis_summary": { - "analysis": analysis, - "data_matches_task": data_matches_task, - }, - } - - ## Tools - Completion - - def _can_complete(self) -> bool: - """Check if discovery can be marked complete.""" - # Must have a routine - if not self._discovery_state.production_routine: - return False - - # If no browser connected, can complete without validation - # (we can't execute routines without a browser) - if not self._remote_debugging_address: - return True - - # With browser: must have validated and analyzed successfully - if not self._discovery_state.validation_analyzed: - return False - - analysis = self._discovery_state.last_analysis - if not analysis: - return False - - return analysis.get("data_matches_task", False) - - @agent_tool( - availability=lambda self: self._can_complete(), - ) - def _done(self) -> dict[str, Any]: - """Mark discovery as complete. Available after construct_routine (no browser) or successful analyze_validation (with browser).""" - if not self._discovery_state.production_routine: - return {"error": "No routine constructed. Use construct_routine first."} - - # If browser connected, require successful validation analysis - if self._remote_debugging_address: - if not self._discovery_state.validation_analyzed: - return {"error": "Validation not analyzed. Use validate_routine then analyze_validation first."} - - analysis = self._discovery_state.last_analysis - if not analysis: - return {"error": "No analysis found. 
Use analyze_validation first."} - - if not analysis.get("data_matches_task", False): - return { - "error": "Cannot complete when data doesn't match task.", - "message": "Fix the routine with construct_routine, then validate_routine and analyze_validation again.", - } - - self._discovery_state.phase = DiscoveryPhase.COMPLETE - self._final_routine = self._discovery_state.production_routine - - # Note if routine was not validated - message = "Discovery completed" - if not self._remote_debugging_address: - message += " (routine not validated - no browser connected)" - - return { - "success": True, - "message": message, - "routine_name": self._final_routine.name, - } - - @agent_tool( - availability=lambda self: ( - self._discovery_state.root_transaction is None - or self._discovery_state.construction_attempts >= 5 - ), - ) - def _fail(self, reason: str) -> dict[str, Any]: - """ - Mark discovery as failed. - - Args: - reason: Why discovery could not be completed. - """ - self._discovery_state.phase = DiscoveryPhase.FAILED - self._failure_reason = reason - return { - "success": False, - "message": "Discovery marked as failed", - "reason": reason, - } diff --git a/bluebox/agents/routine_inspector.py b/bluebox/agents/routine_inspector.py new file mode 100644 index 00000000..3f0aa9ee --- /dev/null +++ b/bluebox/agents/routine_inspector.py @@ -0,0 +1,314 @@ +""" +bluebox/agents/routine_inspector.py + +RoutineInspector — independent quality gate for constructed routines. + +The inspector receives ALL context in the task prompt and returns a structured +RoutineInspectionResult. It has no knowledge of the discovery process — it +judges the OUTPUT, not the PROCESS. When equipped with documentation tools, +it can search common-issues docs to provide specific remediation advice. + +Think of it as a peer reviewer: reads the routine cold, checks if the claims +hold up, and decides: publish, revise, or reject. +""" + +from __future__ import annotations + +from textwrap import dedent +from typing import Callable, TYPE_CHECKING + +from bluebox.agents.abstract_agent import AbstractAgent, AgentCard +from bluebox.workspace import AgentWorkspace +from bluebox.data_models.llms.interaction import ( + Chat, + ChatThread, + EmittedMessage, +) +from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel +from bluebox.data_models.orchestration.inspection import RoutineInspectionResult +from bluebox.data_models.orchestration.result import SpecialistResultWrapper +from bluebox.utils.logger import get_logger + +if TYPE_CHECKING: + from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader + +logger = get_logger(name=__name__) + + +class RoutineInspector(AbstractAgent): + """ + Independent quality gate for constructed routines. + + Receives routine + execution result + exploration context as the task prompt, + scores on 6 dimensions, and returns a RoutineInspectionResult via + finalize_with_output. Has optional access to documentation tools to provide + specific, actionable remediation advice in recommendations. + """ + + AGENT_CARD = AgentCard( + description=( + "Independent quality gate that judges constructed routines on 6 dimensions: " + "task completion, data quality, parameter coverage, routine robustness, " + "structural correctness, and documentation quality. Can reference routine " + "documentation to provide actionable fix recommendations." + ), + ) + SYSTEM_PROMPT: str = dedent("""\ + You are a routine quality inspector. You judge routines objectively. 
+ + You have NO knowledge of how the routine was built. You only see: + - The user's task + - The routine JSON + - The execution result (if available) + - Exploration summaries (what the site looks like) + + Your job: score the routine and decide if it ships. + """) + + AUTONOMOUS_SYSTEM_PROMPT: str = dedent("""\ + You are an independent routine quality inspector. You receive a routine + and must judge whether it correctly accomplishes its own stated purpose + (name + description). Do NOT judge it against any broader project goal — + only against what the routine itself claims to do. + + ## CRITICAL: Judge ACTUAL Results, Not Hypotheticals + + You score based on WHAT ACTUALLY HAPPENED, not what "would work if...". + If the execution returned a 401, the routine FAILED. Period. You do not + get to say "it would return rich data with valid credentials" — that is + speculation, not inspection. A routine that doesn't work doesn't ship. + + **Automatic failure signals (ANY of these → task_completion ≤ 2, data_quality ≤ 2):** + - HTTP 4xx or 5xx status codes in ANY operation response + - Unresolved placeholders (e.g. "Could not resolve placeholder: ...") + - Error messages in the response body (e.g. "Access denied", "Unauthorized", + "Invalid", "Forbidden", "Not found") + - Test parameters containing obvious placeholder values like "REPLACE_WITH_...", + "YOUR_..._HERE", "TODO", "FIXME" — this means the routine can't be tested + - Empty or null response data when the routine promises to return something + - The execution_result.data containing an error object instead of real data + + **You are a quality gate, not a cheerleader.** Your job is to BLOCK bad routines + from shipping. If you let a broken routine through, it pollutes the database + and wastes other agents' time. When in doubt, FAIL it. + + ## CRITICAL: Spec Description Downgrade Detection + + When the inspection prompt includes a "Spec vs Routine Description Comparison" + section, you MUST check whether the routine's own description has been watered + down from the original spec. If the spec promises rich, detailed data but the + routine description claims to return only minimal fields, this is a BLOCKING issue: + + - Add blocking issue: "Routine description is significantly weaker than the spec + description. Spec promises: ''. Routine claims: ''. + The routine must deliver on the original spec or the spec should be updated." + - Cap task_completion at 4 — the routine may work for what it claims, but it + does NOT fulfill the originally planned capability. + - Cap data_quality at 4 — returning 2 fields when 15 were promised is not + quality data. + + ## Scoring Rubric (6 dimensions, 0-10 each) + + 1. **Task Completion** — Does the returned data ACTUALLY accomplish what + the routine's name and description promise? Check the REAL execution result. + - Did the routine return the data it claims to return? Not "could it" — DID IT? + - A flight search that returned a 401 error did NOT return flights → score 0-2 + - A standings routine that returned an HTML error page did NOT return standings → score 0-2 + - ONLY score above 5 if the execution result contains ACTUAL meaningful data + that matches what the routine promises + + 2. **Data Quality** — Is the ACTUAL response complete and meaningful? 
+ - Check the REAL response data, not what you imagine it could contain + - A 401/403/500 response has ZERO data quality regardless of how "correct" + the request structure looks → score 0-2 + - An error message body is not "data" → score 0 + - Truncated, empty, or missing data → score 0-3 + - ONLY score above 5 if the response contains REAL, COMPLETE, MEANINGFUL data + + 3. **Parameter Coverage** — Are the right values parameterized? Any hardcoded + values that should be params (dates, search terms, IDs)? Any unnecessary + params that could be hardcoded? + + 4. **Routine Robustness** — Would this work in a fresh session? Are dynamic + tokens properly resolved via placeholders (not hardcoded expired values)? + Does it handle auth correctly (navigate first to establish cookies/tokens + before making API calls)? + - If any placeholder failed to resolve → score ≤ 4 + - If auth tokens are not properly obtained → score ≤ 3 + + 5. **Structural Correctness** — Navigate before fetch? Dependencies before + dependents? Consistent session_storage_key usage (write before read)? + Valid placeholder types? Operations in correct order? + + 6. **Documentation Quality** — CRITICAL: These routines will be vectorized and + stored in databases for other agents to discover via semantic search. + Score strictly: + + **Routine name** (0-3 points): + - Must be snake_case with verb_site_noun pattern, ≥3 segments + - MUST include the site/service name so the name makes sense in isolation + to an agent that has never seen this routine before + - GOOD: get_premierleague_standings, search_amtrak_trains, fetch_espn_scores + - BAD: get_standings (from where?), get_content_item (what content? what site?), + fetch_data (completely generic), search_matches (which sport? which site?) + - 0 = missing/generic/no site context, 1 = has site but vague noun, + 2 = decent with site + noun, 3 = precise verb_site_noun with clear specificity + + **Routine description** (0-4 points): + - Must be ≥8 words + - Must explain: (a) what it does, (b) what inputs it takes, (c) what data it returns + - Example of 4/4: "Fetches Premier League standings for a given competition ID + and season ID, returning team names, positions, points, and goal difference." + - 0 = missing/useless, 1 = says what it does only, 2 = adds inputs, 3 = adds outputs, 4 = complete + + **Parameter descriptions** (0-3 points): + - Every parameter must have a description of ≥3 words + - Should explain what the value represents AND its expected format/range + - CRITICAL for non-obvious parameters (opaque IDs, slugs, codes, UUIDs): + The description MUST explain WHERE to get the value. If the user can't + google it, the description must say how to obtain it — e.g. which other + routine or API endpoint provides valid values. + - Example of 3/3: "Internal competition ID. Obtain from the get_competitions + routine or the /competitions endpoint. Example: 1 = Premier League." + - Example of 2/3: "The unique competition identifier (e.g. 1 for Premier League)" + (good but doesn't say where to get other valid IDs) + - Example of 0/3: "ID" or "the season" + - 0 = missing descriptions, 1 = all present but terse, 2 = mostly good, 3 = all + excellent with sourcing info for non-obvious params + + A score ≤4 in documentation_quality is a BLOCKING issue — the routine cannot + ship with poor metadata because it will be invisible to other agents. 
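The three documentation sub-scores above (name 0-3, description 0-4, parameter descriptions 0-3) sum to the 0-10 `documentation_quality` dimension. A small sketch of that arithmetic, including the blocking threshold stated in the rubric:

```python
def documentation_quality(name_pts: int, desc_pts: int, param_pts: int) -> int:
    """Combine the 0-3 / 0-4 / 0-3 sub-scores into the 0-10 dimension."""
    assert 0 <= name_pts <= 3 and 0 <= desc_pts <= 4 and 0 <= param_pts <= 3
    return name_pts + desc_pts + param_pts

# A total of 4 or less is a blocking issue per the rubric above:
assert documentation_quality(1, 2, 1) == 4   # would block shipping
assert documentation_quality(3, 4, 3) == 10
```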
+ + ## Verdict Rules + + - overall_pass = True if: no blocking_issues AND overall_score >= 60 + - overall_score = round(sum of all 6 dimension scores / 60 × 100) (max 100) + - documentation_quality ≤ 4 → add blocking issue: "Documentation quality too low + for vectorized storage — fix routine name, description, or parameter descriptions" + - ANY HTTP 4xx/5xx in execution → add blocking issue describing the failure + - ANY unresolved placeholder → add blocking issue describing which placeholder failed + - Be STRICT on all dimensions. A broken routine is WORSE than no routine — it + wastes database space and misleads other agents. Only pass routines that + ACTUALLY WORK with REAL DATA in the execution result. + + ## Documentation-Backed Recommendations + + When you have access to documentation tools (search_files, read_file), + use them to provide SPECIFIC, actionable remediation advice in your + recommendations. Don't just say "fix the auth" — search for the relevant + doc and cite the exact fix pattern. + + Common patterns to search for: + - "TypeError: Failed to fetch" → search_files(scope="docs", query="cors-failed-to-fetch", mode="exact") → + the fix is adding a navigate operation to the allowed origin + - 401/403 errors → search_files(scope="docs", query="unauthenticated", mode="exact") → the fix is adding + auth token fetch + js_evaluate extraction before data fetches + - Placeholder issues → search_files(scope="docs", query="placeholder-not-resolved", mode="exact") → + check placeholder syntax and resolution types + - HTML instead of JSON → search_files(scope="docs", query="fetch-returns-html", mode="exact") → wrong URL + or CORS redirect + + Your recommendations should include: (1) what's wrong, (2) the specific + fix from documentation with example operations if applicable. + + IMPORTANT: Only search docs when you identify a blocking issue that has + a known fix pattern. Do NOT search docs for every inspection — only when + you can provide actionable remediation. Keep doc searches to 1-2 max per + inspection to stay within iteration limits. + + ## Process + + 1. Read the routine name and description — this is what you're scoring against + 2. Read the routine JSON — understand each operation's purpose + 3. Read the execution result — **DID IT ACTUALLY WORK?** Check EVERY operation's + HTTP status code. Check for unresolved placeholders. Check for error messages. + This is the MOST IMPORTANT step. If the execution failed, the routine fails. + 4. Cross-reference with exploration summaries — does the data match? + 5. Score each dimension with specific reasoning based on ACTUAL results + 6. List blocking issues (MUST fix) and recommendations (SHOULD fix) + - If docs are available and you identified a fixable issue, search the + common-issues docs to include a specific fix in recommendations + 7. Write a 2-3 sentence summary + 8. 
Call finalize_with_output with the complete inspection result + """) + + # ----------------------------------------------------------------------- + # Constructor + # ----------------------------------------------------------------------- + + def __init__( + self, + emit_message_callable: Callable[[EmittedMessage], None], + persist_chat_callable: Callable[[Chat], Chat] | None = None, + persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, + stream_chunk_callable: Callable[[str], None] | None = None, + llm_model: LLMModel = OpenAIModel.GPT_5_1, + chat_thread: ChatThread | None = None, + existing_chats: list[Chat] | None = None, + documentation_data_loader: DocumentationDataLoader | None = None, + workspace: AgentWorkspace | None = None, + ) -> None: + super().__init__( + emit_message_callable=emit_message_callable, + workspace=workspace, + persist_chat_callable=persist_chat_callable, + persist_chat_thread_callable=persist_chat_thread_callable, + stream_chunk_callable=stream_chunk_callable, + llm_model=llm_model, + chat_thread=chat_thread, + existing_chats=existing_chats, + documentation_data_loader=documentation_data_loader, + allow_code_execution=True, + ) + logger.debug("RoutineInspector initialized") + + # ----------------------------------------------------------------------- + # Abstract method implementations + # ----------------------------------------------------------------------- + + def _get_system_prompt(self) -> str: + return self.SYSTEM_PROMPT + self._generate_code_execution_prompt() + + def _get_autonomous_system_prompt(self) -> str: + return ( + self.AUTONOMOUS_SYSTEM_PROMPT + + self._get_output_schema_prompt_section() + + self._generate_code_execution_prompt() + + self._get_documentation_prompt_section() + + self._get_urgency_notice() + ) + + def _get_autonomous_initial_message(self, task: str) -> str: + return ( + f"INSPECTION REQUEST:\n\n{task}\n\n" + "Score this routine on all 6 dimensions (including documentation_quality), " + "identify blocking issues vs. recommendations, and call finalize_with_output " + "with the complete RoutineInspectionResult.\n\n" + "CRITICAL REMINDERS:\n" + "1. CHECK THE EXECUTION RESULT FIRST. If ANY operation returned HTTP 4xx/5xx, " + "the routine FAILED. Score task_completion and data_quality ≤ 2. Do NOT " + "speculate about what 'would work' — judge what ACTUALLY happened.\n" + "2. Check for unresolved placeholders in warnings — these are automatic failures.\n" + "3. Check test_parameters for placeholder values like 'REPLACE_WITH_...' — " + "if the routine wasn't tested with real inputs, it cannot pass.\n" + "4. Documentation quality: score name, description, and parameter descriptions " + "strictly. documentation_quality ≤ 4 is a blocking issue." + ) + + def _get_autonomous_result(self) -> SpecialistResultWrapper | None: + """ + Return autonomous result with normalized/clamped inspection scores. + """ + result = super()._get_autonomous_result() + if not isinstance(result, SpecialistResultWrapper): + return result + if not result.success or not isinstance(result.output, dict): + return result + + try: + normalized = RoutineInspectionResult.model_validate(result.output) + result.output = normalized.model_dump(mode="json") + except Exception: + # Keep raw output if normalization fails. 
+ pass + return result diff --git a/bluebox/agents/specialists/__init__.py b/bluebox/agents/specialists/__init__.py index 0871eb17..7b7debec 100644 --- a/bluebox/agents/specialists/__init__.py +++ b/bluebox/agents/specialists/__init__.py @@ -1,31 +1,26 @@ """ bluebox/agents/specialists/__init__.py -NOTE: This file is necessary because it triggers AbstractSpecialist.__init_subclass__ -for all specialist classes, populating AbstractSpecialist._subclasses list. -This enables using AbstractSpecialist.get_all_subclasses() to discover specialists. +NOTE: This file imports specialist classes so AbstractAgent subclass registration +runs for each concrete specialist at import time. """ -from bluebox.agents.specialists.abstract_specialist import ( - AbstractSpecialist, - AutonomousConfig, - RunMode, -) -from bluebox.agents.abstract_agent import agent_tool +from bluebox.agents.abstract_agent import AgentExecutionMode, AutonomousRunConfig, agent_tool -# Import all specialist classes to trigger AbstractSpecialist.__init_subclass__ +# Import all specialist classes so AbstractAgent.__init_subclass__ registers them +from bluebox.agents.specialists.dom_specialist import DOMSpecialist from bluebox.agents.specialists.interaction_specialist import InteractionSpecialist from bluebox.agents.specialists.js_specialist import JSSpecialist from bluebox.agents.specialists.network_specialist import NetworkSpecialist from bluebox.agents.specialists.value_trace_resolver_specialist import ValueTraceResolverSpecialist __all__ = [ - # Base class and utilities - "AbstractSpecialist", - "AutonomousConfig", - "RunMode", + # Utilities + "AutonomousRunConfig", + "AgentExecutionMode", "agent_tool", # Concrete specialists + "DOMSpecialist", "InteractionSpecialist", "JSSpecialist", "NetworkSpecialist", diff --git a/bluebox/agents/specialists/abstract_specialist.py b/bluebox/agents/specialists/abstract_specialist.py deleted file mode 100644 index cacca874..00000000 --- a/bluebox/agents/specialists/abstract_specialist.py +++ /dev/null @@ -1,606 +0,0 @@ -""" -bluebox/agents/specialists/abstract_specialist.py - -Abstract base class for specialist agents. - -Specialists are domain-expert agents that an orchestrator deploys for specific tasks. -Each specialist owns: - - A system prompt (conversational + autonomous variants) - - A set of LLM tools and their execution logic - - Finalize tools for autonomous mode (registered after min_iterations) - -This class extends AbstractAgent to add: - - Autonomous mode with iteration tracking and finalize gating - - Conversational mode for interactive chat - -Tools are defined declaratively via the @agent_tool decorator. 
-""" - -from __future__ import annotations - -import json -from abc import abstractmethod -from enum import StrEnum -from typing import TYPE_CHECKING, Any, Callable, ClassVar, NamedTuple - -import jsonschema -from pydantic import BaseModel - -from bluebox.agents.abstract_agent import AbstractAgent, agent_tool -from bluebox.data_models.orchestration.result import SpecialistResultWrapper -from bluebox.utils.llm_utils import token_optimized -from bluebox.data_models.llms.interaction import ( - Chat, - ChatRole, - ChatThread, - EmittedMessage, - ChatResponseEmittedMessage, - ErrorEmittedMessage, -) -from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel -from bluebox.utils.logger import get_logger - -if TYPE_CHECKING: - from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader - -logger = get_logger(name=__name__) - - - -class RunMode(StrEnum): - """How the specialist is being run.""" - CONVERSATIONAL = "conversational" # interactive chat with a user - AUTONOMOUS = "autonomous" # autonomous loop (exploration + finalization) - - -class AutonomousConfig(NamedTuple): - """ - Configuration for autonomous specialist runs. Helps manage their "lifecycles." - """ - min_iterations: int = 3 # Minimum iterations before finalize tools become available - max_iterations: int = 10 # Maximum iterations before loop exits (returns None if not finalized) - - -class AbstractSpecialist(AbstractAgent): - """ - Abstract base class for specialist agents. - - Subclasses implement domain-specific logic by overriding: - - _get_system_prompt() - - _get_autonomous_system_prompt() - - _get_autonomous_initial_message() - - _check_autonomous_completion() — inspect tool results for finalize signals - - Tools are defined declaratively via the @agent_tool decorator on handler - methods. Each tool's ``availability`` controls when it is registered: True - (always), or a callable evaluated before each LLM call. - - This class extends AbstractAgent with: - - Autonomous mode with iteration tracking and finalize gating - - Conversational mode for interactive chat - """ - - ## Class-level tracking of all specialist subclasses - _subclasses: ClassVar[list[type[AbstractSpecialist]]] = [] - - ## Magic methods - - def __init__( - self, - emit_message_callable: Callable[[EmittedMessage], None], - persist_chat_callable: Callable[[Chat], Chat] | None = None, - persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, - stream_chunk_callable: Callable[[str], None] | None = None, - llm_model: LLMModel = OpenAIModel.GPT_5_2, - run_mode: RunMode = RunMode.CONVERSATIONAL, - chat_thread: ChatThread | None = None, - existing_chats: list[Chat] | None = None, - documentation_data_loader: DocumentationDataLoader | None = None, - ) -> None: - """ - Initialize the specialist. - - Args: - emit_message_callable: Callback to emit messages to the host. - persist_chat_callable: Optional callback to persist Chat objects. - persist_chat_thread_callable: Optional callback to persist ChatThread. - stream_chunk_callable: Optional callback for streaming text chunks. - llm_model: The LLM model to use. - run_mode: How the specialist will be run (conversational or autonomous). - chat_thread: Existing ChatThread to continue, or None for new. - existing_chats: Existing Chat messages if loading from persistence. - documentation_data_loader: Optional DocumentationDataLoader for docs/code search tools. 
- """ - # lifecycle state (must be set before parent __init__, which calls _sync_tools) - self.run_mode: RunMode = run_mode - self._autonomous_iteration: int = 0 - self._autonomous_config: AutonomousConfig = AutonomousConfig() - - # orchestrator-defined output schema (set via set_output_schema()) - self._task_output_schema: dict[str, Any] | None = None - self._task_output_description: str | None = None - self._notes: list[str] = [] - self._wrapped_result: SpecialistResultWrapper | None = None - - # call parent init - super().__init__( - emit_message_callable=emit_message_callable, - persist_chat_callable=persist_chat_callable, - persist_chat_thread_callable=persist_chat_thread_callable, - stream_chunk_callable=stream_chunk_callable, - llm_model=llm_model, - chat_thread=chat_thread, - existing_chats=existing_chats, - documentation_data_loader=documentation_data_loader, - ) - - def __init_subclass__(cls: type[AbstractSpecialist], **kwargs: NamedTuple) -> None: - """Register subclass when it's defined.""" - super().__init_subclass__(**kwargs) - # Only register concrete specialists (not intermediate ABCs) - if not cls.__name__.startswith("Abstract"): - cls._subclasses.append(cls) - - ## Class methods - - @classmethod - def get_all_subclasses(cls) -> list[type[AbstractSpecialist]]: - """Return a copy of all registered specialist subclasses.""" - return cls._subclasses.copy() - - @classmethod - def get_by_type(cls, agent_type: str) -> type[AbstractSpecialist] | None: - """ - Look up a specialist class by name. - - Args: - agent_type: The class name (e.g., "NetworkSpecialist", "JSSpecialist"). - - Returns: - The specialist class, or None if not found. - """ - for subclass in cls._subclasses: - if subclass.__name__ == agent_type: - return subclass - return None - - @classmethod - def get_all_agent_types(cls) -> list[str]: - """Return all registered specialist class names.""" - return [subclass.__name__ for subclass in cls._subclasses] - - ## Additional abstract methods for autonomous mode - - @abstractmethod - def _get_autonomous_system_prompt(self) -> str: - """ - Return the system prompt for autonomous mode. - - Called every iteration, so it can include dynamic context - (e.g., iteration count, urgency notices). - """ - - @abstractmethod - def _get_autonomous_initial_message(self, task: str) -> str: - """ - Build the initial USER message for autonomous mode. - - Args: - task: The user's task description. - - Returns: - Message string to seed the autonomous conversation. - """ - - def _check_autonomous_completion(self, tool_name: str) -> bool: - """ - Check whether a tool call signals autonomous completion. - - Called after each tool execution in the autonomous loop. - Return True to stop the loop (e.g., finalize_result was called - and self._autonomous_result is now set). - - Default implementation checks for the generic finalize tools: - - finalize_with_output, finalize_with_failure (with output schema) - - finalize_result, finalize_failure (without output schema) - - Subclasses should override this and call super() to also check - for their own specialist-specific finalize tools. - - Args: - tool_name: Name of the tool that was just executed. - - Returns: - True if the autonomous loop should stop. 
- """ - # Check for generic finalize tools (both with-schema and without-schema variants) - finalize_tools = ( - "finalize_with_output", - "finalize_with_failure", - "finalize_result", - "finalize_failure", - ) - if tool_name in finalize_tools: - return self._wrapped_result is not None - return False - - def _get_autonomous_result(self) -> BaseModel | None: - """ - Return the autonomous mode result after the loop completes. - - Default implementation returns the wrapped result if set via the - generic finalize tools. Subclasses should override this and check - for _wrapped_result first, then fall back to their own result types. - - Returns: - A Pydantic model with the specialist's result, - or None if max iterations were reached without finalization. - """ - return self._wrapped_result - - ## Properties - - @property - def autonomous_iteration(self) -> int: - """Return the current/final autonomous iteration count.""" - return self._autonomous_iteration - - @property - def can_finalize(self) -> bool: - """ - Whether "finalize tools" should be available (autonomous mode, past min_iterations). - - Returns: - True if the specialist is in autonomous mode and has exceeded the min_iterations threshold, False otherwise. - """ - return ( - self.run_mode == RunMode.AUTONOMOUS - and self._autonomous_iteration >= self._autonomous_config.min_iterations - ) - - @property - def has_output_schema(self) -> bool: - """Whether an output schema has been set by the orchestrator.""" - return self._task_output_schema is not None - - ## Output Schema Methods - - def set_output_schema( - self, - schema: dict[str, Any], - description: str | None = None, - ) -> None: - """ - Set the expected output schema for this task. - - Called by the orchestrator before running the specialist to define - what structure the specialist should return. - - Args: - schema: JSON Schema defining the expected output structure. - description: Human-readable description of what to return. - """ - self._task_output_schema = schema - self._task_output_description = description - - def _get_output_schema_prompt_section(self) -> str: - """ - Get the output schema section to include in autonomous system prompt. - - Subclasses should call this and include it in their _get_autonomous_system_prompt(). - - Returns: - Formatted prompt section describing expected output, or empty string if no schema set. - """ - if not self._task_output_schema: - return "" - - parts = ["\n\n## Expected Output Schema\n"] - - if self._task_output_description: - parts.append(f"**Description:** {self._task_output_description}\n\n") - - parts.append("**Schema:**\n```json\n") - parts.append(json.dumps(self._task_output_schema, indent=2)) - parts.append("\n```\n") - - parts.append( - "\nWhen ready, call `finalize_with_output(output={...})` with data matching this schema. " - "Use `add_note()` before finalizing to record any notes, complaints, warnings, or errors." - ) - - return "".join(parts) - - def _get_urgency_notice(self) -> str: - """ - Iteration-aware urgency notice for autonomous system prompts. - - Appended to autonomous prompts to nudge the LLM toward finalizing. - Replaces the per-specialist urgency logic that was previously duplicated. - """ - finalize_tool = "finalize_with_output" if self.has_output_schema else "finalize_result" - - if self.can_finalize: - remaining = self._autonomous_config.max_iterations - self._autonomous_iteration - if remaining <= 2: - return f"\n\n## URGENT: Only {remaining} iteration(s) left — call `{finalize_tool}` NOW." 
- if remaining <= 4: - return f"\n\n## Finalize soon — {remaining} iterations remaining." - return f"\n\n## `{finalize_tool}` is now available." - return f"\n\n## Continue exploring (iteration {self._autonomous_iteration})." - - @agent_tool - def add_note(self, note: str) -> dict[str, Any]: - """ - Add a note to the result wrapper. - - Use this for notes, complaints, warnings, or errors encountered during execution. - These are passed back to the orchestrator along with the result. - - Args: - note: The note/complaint/warning/error message. - """ - self._notes.append(note) - return {"status": "ok", "total_notes": len(self._notes)} - - ## Generic Finalize Tool (for orchestrator-defined schemas) - - @agent_tool(availability=lambda self: self.can_finalize and self.has_output_schema) - @token_optimized - def _finalize_with_output(self, output: dict[str, Any]) -> dict[str, Any]: - """ - Finalize with output matching the orchestrator's expected schema. - - This tool is available when the orchestrator has defined an output schema - for the task. The output must match the schema or validation will fail. - - Args: - output: Result data matching the expected output schema. - """ - if not self._task_output_schema: - return {"error": "No output schema defined for this task"} - - # Validate against schema - try: - jsonschema.validate(instance=output, schema=self._task_output_schema) - except jsonschema.ValidationError as e: - return { - "error": "Output does not match expected schema", - "validation_error": str(e.message), - "schema_path": list(e.absolute_schema_path), - "hint": "Fix the output structure and try again.", - } - - # Store the wrapped result - self._wrapped_result = SpecialistResultWrapper( - output=output, - success=True, - notes=self._notes.copy(), - failure_reason=None, - ) - - logger.info("Specialist finalized with output matching schema") - return { - "status": "success", - "message": "Output validated and stored successfully", - "notes_count": len(self._notes), - } - - @agent_tool(availability=lambda self: self.can_finalize and self.has_output_schema) - @token_optimized - def _finalize_with_failure(self, reason: str) -> dict[str, Any]: - """ - Finalize with failure when the task cannot be completed. - - Use this when you cannot produce the expected output after thorough analysis. - - Args: - reason: Explanation of why the task could not be completed. - """ - self._wrapped_result = SpecialistResultWrapper( - output=None, - success=False, - notes=self._notes.copy(), - failure_reason=reason, - ) - - logger.info("Specialist finalized with failure: %s", reason) - return { - "status": "failure", - "message": "Task marked as failed", - "reason": reason, - } - - ## Generic Finalize Tools (for tasks without output schema) - - @agent_tool(availability=lambda self: self.can_finalize and not self.has_output_schema) - @token_optimized - def _finalize_result(self, output: dict[str, Any]) -> dict[str, Any]: - """ - Finalize and return the result of your analysis. - - Use this to submit your findings when you have completed the task. - The output should contain all relevant information discovered. - - Args: - output: Dictionary containing your findings and analysis results. 
- """ - self._wrapped_result = SpecialistResultWrapper( - output=output, - success=True, - notes=self._notes.copy(), - ) - - logger.info("Specialist finalized with result (no schema)") - return { - "status": "success", - "message": "Result submitted successfully", - "notes_count": len(self._notes), - } - - @agent_tool(availability=lambda self: self.can_finalize and not self.has_output_schema) - @token_optimized - def _finalize_failure(self, reason: str) -> dict[str, Any]: - """ - Finalize with failure when the task cannot be completed. - - Use this when you cannot produce results after thorough analysis. - - Args: - reason: Explanation of why the task could not be completed. - """ - self._wrapped_result = SpecialistResultWrapper( - output=None, - success=False, - notes=self._notes.copy(), - failure_reason=reason, - ) - - logger.info("Specialist finalized with failure (no schema): %s", reason) - return { - "status": "failure", - "message": "Task marked as failed", - "reason": reason, - } - - ## Public API - - def run_autonomous( - self, - task: str, - config: AutonomousConfig | None = None, - output_schema: dict[str, Any] | None = None, - output_description: str | None = None, - ) -> BaseModel | None: - """ - Run the specialist autonomously to completion. - - The specialist will: - 1. Use its tools to explore and analyze data - 2. After min_iterations, finalize tools become available (via can_finalize) - 3. Return a typed result when finalize is called, or None on timeout - - Args: - task: User task description. - config: Autonomous run configuration (iterations limits). Uses defaults if None. - output_schema: JSON Schema defining expected output structure. - output_description: Human-readable description of expected output. - - Returns: - Specialist-specific result model, or None if max iterations reached. - """ - self.run_mode = RunMode.AUTONOMOUS - self._autonomous_iteration = 0 - self._autonomous_config = config or AutonomousConfig() - - # Subclass should reset its own result fields in _reset_autonomous_state() - self._reset_autonomous_state() - - # Set output schema AFTER reset (so it doesn't get cleared) - if output_schema: - self.set_output_schema(output_schema, output_description) - - # Seed the conversation - initial_message = self._get_autonomous_initial_message(task) - self._add_chat(ChatRole.USER, initial_message) - - logger.info( - "Starting %s autonomous run for task: %s", - self.__class__.__name__, task, - ) - - self._run_autonomous_loop() - - self.run_mode = RunMode.CONVERSATIONAL - - return self._get_autonomous_result() - - def _reset_autonomous_state(self) -> None: - """ - Reset autonomous-mode state before a new run. - - Override in subclasses to clear specialist-specific result fields - (e.g., self._discovery_result = None). Call super() first. - - NOTE: Method is not abstract; it is intentionally a no-op by default. Not every specialist - has extra autonomous state to reset; those that don't simply inherit this. 
- """ - # Clear orchestrator-defined output schema state - self._task_output_schema = None - self._task_output_description = None - self._notes = [] - self._wrapped_result = None - - def reset(self) -> None: - """Reset the conversation to a fresh state.""" - # Reset autonomous state - self.run_mode = RunMode.CONVERSATIONAL - self._autonomous_iteration = 0 - self._reset_autonomous_state() - - # Call parent reset - super().reset() - - ## Agent loops - - def _run_autonomous_loop(self) -> None: - """Run the autonomous agent loop with iteration tracking and finalize gating.""" - max_iterations = self._autonomous_config.max_iterations - for iteration in range(max_iterations): - self._autonomous_iteration = iteration + 1 - logger.debug("Autonomous loop iteration %d/%d", self._autonomous_iteration, max_iterations) - - messages = self._build_messages_for_llm() - try: - # Use tool_choice="required" to force the LLM to always call a tool - # This prevents the loop from exiting due to text-only responses - response = self._call_llm( - messages, - self._get_autonomous_system_prompt(), - tool_choice="required", - ) - - if response.response_id: - self._previous_response_id = response.response_id - - if response.content or response.tool_calls: - chat = self._add_chat( - role=ChatRole.ASSISTANT, - content=response.content or "", - tool_calls=response.tool_calls if response.tool_calls else None, - llm_provider_response_id=response.response_id, - ) - if response.content: - self._emit_message( - ChatResponseEmittedMessage( - content=response.content, - chat_id=chat.id, - chat_thread_id=self._thread.id, - ) - ) - - if not response.tool_calls: - # This shouldn't happen with tool_choice="required", but handle it just in case - logger.warning("Autonomous loop: no tool calls in iteration %d (unexpected with tool_choice=required)", self._autonomous_iteration) - return - - # Process tool calls and check for completion - for tool_call in response.tool_calls: - result_str = self._auto_execute_tool(tool_call.tool_name, tool_call.tool_arguments) - - self._add_chat( - role=ChatRole.TOOL, - content=f"Tool '{tool_call.tool_name}' result: {result_str}", - tool_call_id=tool_call.call_id, - ) - - if self._check_autonomous_completion(tool_call.tool_name): - logger.debug("Autonomous run completed at iteration %d", self._autonomous_iteration) - return - - except Exception as e: - logger.exception("Error in autonomous loop: %s", e) - self._emit_message(ErrorEmittedMessage(error=str(e))) - return - - logger.warning("Autonomous loop hit max iterations (%d) without finalization", max_iterations) diff --git a/bluebox/agents/specialists/dom_specialist.py b/bluebox/agents/specialists/dom_specialist.py new file mode 100644 index 00000000..7ab55868 --- /dev/null +++ b/bluebox/agents/specialists/dom_specialist.py @@ -0,0 +1,345 @@ +""" +bluebox/agents/specialists/dom_specialist.py + +DOM specialist agent. + +Analyzes captured DOM snapshots to discover page structure, interactive elements, +forms, tables, links, and navigation patterns. Used during the exploration phase +to understand what the browser rendered and what UI elements are available. 
+""" + +from __future__ import annotations + +from textwrap import dedent +from typing import TYPE_CHECKING, Any, Callable + +from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool +from bluebox.workspace import AgentWorkspace, LocalAgentWorkspace +from bluebox.data_models.llms.interaction import ( + Chat, + ChatThread, + EmittedMessage, +) +from bluebox.data_models.llms.vendors import LLMModel, OpenAIModel +from bluebox.llms.data_loaders.dom_data_loader import DOMDataLoader +from bluebox.utils.logger import get_logger + +if TYPE_CHECKING: + from bluebox.llms.data_loaders.documentation_data_loader import DocumentationDataLoader + +logger = get_logger(name=__name__) + + +class DOMSpecialist(AbstractAgent): + """ + DOM specialist agent. + + Analyzes captured DOM snapshots to discover page structure, + interactive elements, forms, and navigation patterns. + """ + + AGENT_CARD = AgentCard( + description=( + "Analyzes captured DOM snapshots (page structure, forms, inputs, buttons, " + "links, tables, headings). Useful for understanding what the browser rendered " + "and what interactive elements exist on each page." + ), + ) + SYSTEM_PROMPT: str = dedent("""\ + You are a DOM structure analyst specializing in understanding web page layouts from captured browser snapshots. + + ## What You Analyze + + - **Forms**: Login forms, search forms, checkout forms — with their inputs, actions, and methods + - **Elements**: Inputs, buttons, links, headings, meta tags, hidden inputs, clickable elements + - **Tables**: Data tables with headers and row counts + - **Script tags**: Server-side data blobs (__NEXT_DATA__, __NUXT__), inline JSON config, structured data (ld+json) + + ## What to Ignore + + - Internal framework nodes, shadow DOM internals + - Style/layout-only elements with no semantic meaning + + ## How to Work + + 1. Start with `list_pages` to see all captured pages + 2. Use `get_elements(element_type=...)` to scan for inputs, buttons, links, headings, meta_tags, hidden_inputs, or clickable elements + 3. Use `get_forms` for forms with their child inputs + 4. Use `get_tables` for data tables + 5. Use `get_scripts` to find server-side data blobs and inline configuration + 6. Use `get_snapshot_diff` to understand what changed between pages + 7. Use `search_strings` to find specific content across snapshots + + """) + + AUTONOMOUS_SYSTEM_PROMPT: str = dedent("""\ + You are a DOM structure analyst that autonomously maps out page structure from captured browser snapshots. + + ## Your Mission + + Analyze all captured DOM snapshots to produce a complete picture of: + - What pages were visited and in what order + - What forms exist and what they do (action URLs, input fields) + - What interactive elements are available (buttons, links, inputs) + - What data is displayed (tables, headings, text content) + - What tokens/keys are embedded in the page (CSRF, session IDs, API keys) + - What server-side data is rendered into the DOM (__NEXT_DATA__, inline JSON, ld+json) + + ## Process + + 1. **Survey**: Use `list_pages` to see all captured pages + 2. **Scan forms**: Use `get_forms` to find all forms with their inputs + 3. 
**Scan elements**: Use `get_elements(element_type=...)` for each type: + - `inputs` — text fields, dropdowns, checkboxes, date pickers + - `buttons` — submit buttons, action buttons + - `links` — anchor links with href values + - `headings` — H1-H6 page structure + - `meta_tags` — CSRF tokens, API configs, verification keys + - `hidden_inputs` — CSRF tokens, session IDs, form tokens + - `clickable` — anything the browser marked as interactive + 4. **Scan tables**: Use `get_tables` for data displays + 5. **Scan scripts**: Use `get_scripts` to find __NEXT_DATA__, inline JSON, framework state blobs + 6. **Check diffs**: Use `get_snapshot_diff` between consecutive pages to see what changed + 7. **Finalize**: Call the appropriate finalize tool with your findings + + ## Output Focus + + Prioritize: forms and their endpoints, parameterizable inputs, action buttons, data tables, + embedded tokens/keys, and server-side data blobs. These are what matter for routine construction. + """) + + ## Magic methods + + def __init__( + self, + emit_message_callable: Callable[[EmittedMessage], None], + dom_data_loader: DOMDataLoader, + documentation_data_loader: DocumentationDataLoader | None = None, + persist_chat_callable: Callable[[Chat], Chat] | None = None, + persist_chat_thread_callable: Callable[[ChatThread], ChatThread] | None = None, + stream_chunk_callable: Callable[[str], None] | None = None, + llm_model: LLMModel = OpenAIModel.GPT_5_1, + chat_thread: ChatThread | None = None, + existing_chats: list[Chat] | None = None, + workspace: AgentWorkspace | None = None, + ) -> None: + self._dom_data_loader = dom_data_loader + + super().__init__( + emit_message_callable=emit_message_callable, + workspace=workspace or LocalAgentWorkspace.from_directory_path("./agent_workspace/specialist"), + persist_chat_callable=persist_chat_callable, + persist_chat_thread_callable=persist_chat_thread_callable, + stream_chunk_callable=stream_chunk_callable, + llm_model=llm_model, + chat_thread=chat_thread, + existing_chats=existing_chats, + documentation_data_loader=documentation_data_loader, + ) + logger.debug( + "DOMSpecialist initialized with %d snapshots", + self._dom_data_loader.stats.total_snapshots, + ) + + ## Abstract method implementations + + def _get_system_prompt(self) -> str: + stats = self._dom_data_loader.stats + context = ( + f"\n\n## DOM Data Context\n" + f"- Total Snapshots: {stats.total_snapshots}\n" + f"- Unique URLs: {stats.unique_urls}\n" + f"- Unique Titles: {stats.unique_titles}\n" + f"- Hosts: {', '.join(stats.hosts.keys())}\n" + ) + return self.SYSTEM_PROMPT + context + + def _get_autonomous_system_prompt(self) -> str: + stats = self._dom_data_loader.stats + context = ( + f"\n\n## DOM Data Context\n" + f"- Total Snapshots: {stats.total_snapshots}\n" + f"- Unique URLs: {stats.unique_urls}\n" + f"- Unique Titles: {stats.unique_titles}\n" + f"- Hosts: {', '.join(stats.hosts.keys())}\n" + ) + + return ( + self.AUTONOMOUS_SYSTEM_PROMPT + + context + + self._get_output_schema_prompt_section() + + self._get_urgency_notice() + ) + + def _get_autonomous_initial_message(self, task: str) -> str: + finalize_success = "finalize_with_output" if self.has_output_schema else "finalize_result" + + return ( + f"TASK: {task}\n\n" + f"Analyze the captured DOM snapshots to map out page structure, forms, " + f"inputs, buttons, links, tables, and navigation patterns. " + f"When confident, use {finalize_success} to report your findings." 
+        )
+
+    ## Tool handlers
+
+    @agent_tool(token_optimized=True)
+    def _list_pages(self) -> dict[str, Any]:
+        """List all captured pages with their URLs, titles, and snapshot metadata."""
+        pages = self._dom_data_loader.list_pages()
+        return {
+            "total_pages": len(pages),
+            "pages": pages,
+        }
+
+    @agent_tool(token_optimized=True)
+    def _get_elements(self, element_type: str, snapshot_index: int | None = None) -> dict[str, Any]:
+        """
+        Get elements of a specific type from DOM snapshots.
+
+        A single tool that replaces individual per-type tools. Supports:
+        - 'inputs' — INPUT, SELECT, TEXTAREA fields with their attributes and values
+        - 'buttons' — BUTTON elements and INPUT type=submit/button
+        - 'links' — anchor links (<a>) with href values
+        - 'headings' — H1-H6 elements with their text content
+        - 'meta_tags' — META elements (CSRF tokens, API endpoints, OG tags, page config)
+        - 'hidden_inputs' — INPUT type=hidden (CSRF tokens, session IDs, form tokens)
+        - 'clickable' — all elements marked as clickable by the browser
+
+        Args:
+            element_type: One of 'inputs', 'buttons', 'links', 'headings', 'meta_tags', 'hidden_inputs', 'clickable'.
+            snapshot_index: If provided, only search this specific snapshot. Otherwise searches all.
+        """
+        try:
+            results = self._dom_data_loader.get_elements(element_type, snapshot_index)
+        except ValueError as e:
+            return {"error": str(e)}
+
+        total = sum(len(r["elements"]) for r in results)
+        return {
+            "element_type": element_type,
+            "total_elements": total,
+            "snapshots_with_elements": len(results),
+            "results": results,
+        }
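+
+    # Example return shape (illustrative only; the per-element dict fields come
+    # from DOMDataLoader and are assumptions here), e.g. for element_type="hidden_inputs":
+    #   {"element_type": "hidden_inputs", "total_elements": 1, "snapshots_with_elements": 1,
+    #    "results": [{"snapshot_index": 0, "elements": [{"name": "csrf_token", "value": "abc123"}]}]}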
+
+    @agent_tool(token_optimized=True)
+    def _get_forms(self, snapshot_index: int | None = None) -> dict[str, Any]:
+        """
+        Get all <form> elements with their action URL, method, and child inputs.
+
+        Args:
+            snapshot_index: If provided, only search this specific snapshot. Otherwise searches all.
+        """
+        results = self._dom_data_loader.get_forms(snapshot_index)
+        total = sum(len(r["forms"]) for r in results)
+        return {
+            "total_forms": total,
+            "snapshots_with_forms": len(results),
+            "results": results,
+        }
+
+    @agent_tool(token_optimized=True)
+    def _get_tables(self, snapshot_index: int | None = None) -> dict[str, Any]:
+        """
+        Get all <table> elements with their headers and row counts.
+
+        Args:
+            snapshot_index: If provided, only search this specific snapshot. Otherwise searches all.
+        """
+        results = self._dom_data_loader.get_tables(snapshot_index)
+        total = sum(len(r["tables"]) for r in results)
+        return {
+            "total_tables": total,
+            "snapshots_with_tables": len(results),
+            "results": results,
+        }
+
+    @agent_tool(token_optimized=True)
+    def _get_scripts(self, snapshot_index: int | None = None, max_inline_chars: int = 2000) -> dict[str, Any]:
+        """
+        Get all <script>