From 5f4cdab2dbdba64d572609d0c9a660919e915f97 Mon Sep 17 00:00:00 2001
From: hubertpysklo
Date: Wed, 18 Feb 2026 10:39:51 +0530
Subject: [PATCH] Update Examples
---
README.md | 202 ++++--------
examples/langchain_agent_benchmark.ipynb | 324 ++++++++-----------
examples/react_agent_benchmark.ipynb | 380 ++++++-----------------
3 files changed, 285 insertions(+), 621 deletions(-)
diff --git a/README.md b/README.md
index c7df422..9c84bbf 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,12 @@ Run it locally (or deploy it). Agents call sandboxed replicas of APIs that behav
Feedback
+### Try it now
+
+| Notebook | Description | Colab |
+|----------|-------------|-------|
+| [ReAct Agent (Paper)](examples/react_agent_benchmark.ipynb) | Custom ReAct loop matching the paper's methodology | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/react_agent_benchmark.ipynb) |
+| [LangChain Agent](examples/langchain_agent_benchmark.ipynb) | LangChain agent with tool calling | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb) |
## Quick Start
@@ -52,7 +58,7 @@ export AGENT_DIFF_BASE_URL="https://api.agentdiff.dev"
Self-Hosted
```bash
-git clone https://github.com/hubertpysklo/agent-diff.git
+git clone https://github.com/agent-diff-bench/agent-diff.git
cd agent-diff/ops
docker-compose up --build
# Backend runs on http://localhost:8000
@@ -60,61 +66,44 @@ docker-compose up --build
-### 3. Flow
+### 3. Use
+
```python
from agent_diff import AgentDiff
-# Self-hosted (defaults to http://localhost:8000)
client = AgentDiff()
-# Initialise isolated environment from a template. See: examples/slack/seeds
-env = client.init_env(templateService="slack", templateName="slack_default",
-impersonateUserId="U01AGENBOT9", TTL="3600") #impersonateUserId - seeded user account that agent will use
+# Create an isolated environment from a template
+env = client.init_env(
+ templateService="slack",
+ templateName="slack_default",
+ impersonateUserId="U01AGENBOT9",
+)
-# print(env.environmentUrl) = http://localhost:8000/api/env/{environmentId}/services/slack
-
-# Take before snapshot
+# Snapshot before agent runs
run = client.start_run(envId=env.environmentId)
-# Your agent does stuff using the environment URL
-# You can swap the URLs in MCPs or use the code executor tool (Python or bash) with a proxy
-
-# Using CodeExecutorProxy with OpenAI Agents SDK (For Vercel AI, check TS SDK docs)
-from agent_diff import PythonExecutorProxy, create_openai_tool
-from agents import Agent, Runner
+# --- Your agent interacts with the API here ---
+# SDK provides code execution proxies (Python/Bash) for OpenAI Agents, LangChain, etc.
+# Agent writes normal code (e.g. requests.post('https://slack.com/api/chat.postMessage', ...))
+# which is automatically intercepted and routed to the sandboxed environment.
-# Create executor (auto-loads from AGENT_DIFF_API_KEY and AGENT_DIFF_BASE_URL env vars)
-python_executor = PythonExecutorProxy(env.environmentId)
-python_tool = create_openai_tool(python_executor)
+from agent_diff import BashExecutorProxy, create_openai_tool
+bash = BashExecutorProxy(env.environmentId)
+tool = create_openai_tool(bash) # also: create_langchain_tool, create_smolagents_tool
-agent = Agent(
- name="Slack Assistant",
- instructions="Use execute_python tool to interact with Slack API at https://slack.com/api/*. Complete the task using the tools provided. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.",
- tools=[python_tool] # python_tool (or bash_tool) where agent will write code
- )
-
-response = await Runner.run(agent, "Post 'Hello' to Slack channel #general")
-
-# The agent writes normal code like:
-# requests.post('https://slack.com/api/chat.postMessage', ...)
-# But it will be proxied to the temporary sandbox environment
-# e.g. transforms:
-# from: https://api.slack.com/api/conversations.list
-# to: http://localhost:8000/api/env/{environmentId}/services/slack/conversations.list
-
-# Compute diff (changes in the environment) and get results
+# Compute state diff and inspect changes
diff = client.diff_run(runId=run.runId)
-
-# Inspect changes
-print(diff.diff['inserts']) # New records, e.g. new message or user added by agent
-print(diff.diff['updates']) # Modified records, edited message
-print(diff.diff['deletes']) # Deleted records, deleted message, linear issue, etc.
+print(diff.diff['inserts']) # new records created by agent
+print(diff.diff['updates']) # modified records
+print(diff.diff['deletes']) # deleted records
# Clean up
client.delete_env(envId=env.environmentId)
-
```
+See the [Python SDK](sdk/agent-diff-python/README.md) and [TS SDK](sdk/agent-diff-ts/README.md) for full reference.
+
## Supported APIs
- **Box** – REST API for file/folder management, search, comments, tags, shared links, hubs, and content versioning. See [`backend/src/services/box/README.md`](backend/src/services/box/README.md). 27 endpoints.
@@ -130,9 +119,9 @@ client.delete_env(envId=env.environmentId)
## Templates, Seeds & Environments
**Templates** are pre-configured database schemas that serve as the starting point for test environments. Think of them as snapshots of a service's state:
-- **Location**: Templates live in PostgreSQL schemas (e.g., `slack_default`, `linear_base`)
-- **Content**: Templates are seeded during startup time from seeds with data like users, channels, messages, issues, etc.
-- **Example Seeds**: **[slack_default](examples/slack/seeds/slack_bench_default.json)** - sample users, channels and messages.
+- **Location**: Templates live in PostgreSQL schemas (e.g., `slack_default`, `box_default`, `linear_expanded`, `calendar_base`)
+- **Content**: Seeded with realistic data — users, channels, messages, files, folders, issues, calendar events, etc.
+- **Seeds**: [box](examples/box/seeds/) | [calendar](examples/calendar/seeds/) | [linear](examples/linear/seeds/) | [slack](examples/slack/seeds/)
@@ -144,43 +133,12 @@ client.delete_env(envId=env.environmentId)
-## CodeExecutorProxy
-
-SDK provides **code execution proxies** - tools for AI agents. You add it to your toolbox in Vercel AI SDK, Langchain or OpenAI Agents, making LLM write Python or Bash code to talk with Slack or Linear API. Requests will automatically be intercepted and routed to isolated test environments. This enables agents to interact with service replicas without any code changes. See more in: **[Python SDK](sdk/agent-diff-python/README.md)**
-
-
-## Paper
-
-> **Agent-Diff: Benchmarking LLM Agents on Enterprise API Tasks via Code Execution with State-Diff-Based Evaluation**
-> Hubert M. Pysklo, Artem Zhuravel, Patrick D. Watson
-> *Pre-print. Under review for KDD 2026.*
-> [arXiv:2602.11224](https://arxiv.org/abs/2602.11224)
-
-If you use Agent-Diff in your research, please cite:
-
-```bibtex
-@article{pysklo2025agentdiff,
- title={Agent-Diff: Benchmarking LLM Agents on Enterprise API Tasks via Code Execution with State-Diff-Based Evaluation},
- author={Pysklo, Hubert M. and Zhuravel, Artem and Watson, Patrick D.},
- journal={arXiv preprint arXiv:2602.11224},
- year={2025}
-}
-```
## Run Evaluations
-The fastest way to run Agent-Diff evaluations is via **[Prime Intellect](https://app.primeintellect.ai/dashboard/environments/hubert-marek/agent-diff-bench)** — run evals or RL training with no setup required.
-
-Alternatively, run locally or self-hosted using the SDK (see [To run evaluations](#to-run-evaluations) below).
-
-### Example Notebooks
-
-- **[ReAct Agent (Paper)](examples/react_agent_benchmark.ipynb)** — Custom ReAct loop matching the paper methodology [](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/react_agent_benchmark.ipynb)
-- **[LangChain Agent](examples/langchain_agent_benchmark.ipynb)** — LangChain agent with tool calling [](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)
-
-**Resources:**
-- **Dataset**: [hubertmarek/agent-diff-bench](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) — 224 tasks across all 4 services (80/20 train/test split)
-- **Prime Intellect**: [agent-diff-bench on Prime Lab](https://app.primeintellect.ai/dashboard/environments/hubert-marek/agent-diff-bench) — hosted evaluations & RL training
+- **[Prime Intellect](https://app.primeintellect.ai/dashboard/environments/hubert-marek/agent-diff-bench)** — Run evals or RL training with no setup required
+- **[Colab Notebooks](#try-it-now)** — Run locally with the example notebooks above
+- **[Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench)** — 224 tasks across all 4 services (80/20 train/test split)
## Benchmark
@@ -232,81 +190,37 @@ Tasks are characterized along five dimensions: _task horizon_ (minimum API calls
Per-service assertion-weighted scores (95% Bayesian CrI). No-docs baseline: agents receive no API documentation and must discover endpoints through exploration. 3 trials per task. Full methodology and documentation ablation results in the [paper](https://arxiv.org/abs/2602.11224).
-## Evaluations & Test Suites
-
-Collections of test cases with assertions that you can run against agent runs using evaluations.
+## Test Suites
-- **[box_bench.json](examples/box/testsuites/box_bench.json)** - test cases covering file/folder operations, search, tags, comments, hubs, and content versioning
-- **[calendar_bench.json](examples/calendar/testsuites/calendar_bench.json)** - test cases covering event CRUD, recurring events, free/busy queries, ACL management, and calendar lifecycle
-- **[linear_bench.json](examples/linear/testsuites/linear_bench.json)** - test cases covering issue management, labels, comments, workflow states, and team operations
-- **[slack_bench.json](examples/slack/testsuites/slack_bench.json)** - test cases covering message sending, channel ops, reactions, threading
+| Service | Test Suite | Tests | Coverage |
+|---------|-----------|-------|----------|
+| Box | [box_bench.json](examples/box/testsuites/box_bench.json) | 48 | File/folder ops, search, tags, comments, hubs, versioning |
+| Calendar | [calendar_bench.json](examples/calendar/testsuites/calendar_bench.json) | 60 | Event CRUD, recurring events, free/busy, ACL, lifecycle |
+| Linear | [linear_bench.json](examples/linear/testsuites/linear_bench.json) | 57 | Issues, labels, comments, workflow states, teams |
+| Slack | [slack_bench.json](examples/slack/testsuites/slack_bench.json) | 59 | Messages, channels, reactions, threading |
-
-
-- **[Evaluation DSL](docs/evaluation-dsl.md)** - Check DSL docs on how it works.
+Each test defines expected state changes via declarative assertions. See the [assertions docs](https://agentdiff.mintlify.app/core-concepts/assertions) for how they work.
+## Documentation
-### To run evaluations:
-
-```python
-from agent_diff import AgentDiff, PythonExecutorProxy, BashExecutorProxy, create_openai_tool
-from agents import Agent, Runner
-
-client = AgentDiff()
-
-
-suite_list = client.list_test_suites(name="Slack Bench")
-slack_suite = suite_list.testSuites[0]
-suite = client.get_test_suite(slack_suite.id, expand=True)
-
-evaluation_results = []
-
-for test in suite.tests:
- prompt = test.prompt
- test_id = test.id
-
- #In test suite you define which env seed template is used for each test
- env = client.init_env(testId=test_id)
-
- # This function will take a snapshot before run
- run = client.start_run(envId=env.environmentId, testId=test_id)
-
-
- bash_executor = BashExecutorProxy(env.environmentId) # Auto-loads from env vars
- bash_tool = create_openai_tool(bash_executor)
-
- agent = Agent(
- name="Slack Assistant",
- instructions="Use execute_bash tool with curl to interact with Slack API at https://slack.com/api/*. Authentication is handled automatically.",
- tools=[bash_tool]
- )
-
- response = await Runner.run(agent, prompt)
+- **[Python SDK](https://agentdiff.mintlify.app/sdks/python/installation)** — Full Python SDK reference
+- **[TypeScript SDK](https://agentdiff.mintlify.app/sdks/typescript/installation)** — Full TypeScript SDK reference
+- **[Assertions & Evaluation DSL](https://agentdiff.mintlify.app/core-concepts/assertions)** — Write test assertions
+- **[API Reference](https://agentdiff.mintlify.app/api-reference/introduction)** — REST API documentation
+- **[Self-Hosting](https://agentdiff.mintlify.app/hosting/docker-setup)** — Docker setup & configuration
- #This function will take a 2nd snapshot, run diff and assert results against expected state defined in test suite
-
- #computes eval
- client.evaluate_run(runId=run.runId)
-
- #returns score runId, full diff and score (0/1)
- run_result = client.get_results_for_run(runId=run.runId)
+## Citation
- evaluation_results.append(run_result)
+If you use Agent-Diff in your research, please cite:
- client.delete_env(envId=env.environmentId)
+```bibtex
+@article{pysklo2026agentdiff,
+ title={Agent-Diff: Benchmarking LLM Agents on Enterprise API Tasks via Code Execution with State-Diff-Based Evaluation},
+ author={Pysklo, Hubert M. and Zhuravel, Artem and Watson, Patrick D.},
+ journal={arXiv preprint arXiv:2602.11224},
+ year={2026}
+}
```
-### Example output:
-
-
-
-
-## Documentation
-
-- **[Python SDK](sdk/agent-diff-python/README.md)** - Complete Python SDK reference
-- **[TS SDK](sdk/agent-diff-ts/README.md)** - Complete TS SDK reference
-- **[Evaluation DSL](docs/evaluation-dsl.md)** - Write test assertions
-- **[API Reference](docs/api-reference.md)** - REST API documentation
-
diff --git a/examples/langchain_agent_benchmark.ipynb b/examples/langchain_agent_benchmark.ipynb
index f9c6172..ceb1857 100644
--- a/examples/langchain_agent_benchmark.ipynb
+++ b/examples/langchain_agent_benchmark.ipynb
@@ -8,9 +8,11 @@
"\n",
"Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using LangChain's built-in agent with tool calling.\n",
"\n",
- "Unlike the [ReAct notebook](react_agent_benchmark.ipynb) which uses a custom XML-tag loop, this notebook lets LangChain handle the agent loop via the model's native function-calling protocol. The `BashExecutorProxy` from the `agent-diff` SDK is wrapped as a LangChain tool.\n",
+ "Unlike the [ReAct notebook](react_agent_benchmark.ipynb) which uses a custom XML-tag loop, this notebook lets LangChain handle the agent loop via the model's native function-calling protocol.\n",
"\n",
- "All 4 services (Box, Calendar, Linear, Slack) are evaluated across 224 tasks.\n",
+ "Two options are shown:\n",
+ "- **Option A** — Load tests from HuggingFace dataset (no server-side test suites needed)\n",
+ "- **Option B** — Load tests from Agent-Diff server test suites (used in production evaluations)\n",
"\n",
"[](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)\n",
"\n",
@@ -23,7 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip install agent-diff langchain langchain-openai tqdm pandas -q"
+ "!pip install agent-diff langchain langchain-openai datasets -q"
]
},
{
@@ -32,22 +34,11 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "from getpass import getpass\n",
- "\n",
- "if not os.environ.get(\"AGENT_DIFF_API_KEY\"):\n",
- " os.environ[\"AGENT_DIFF_API_KEY\"] = getpass(\"Agent-Diff API key: \")\n",
- "\n",
- "if not os.environ.get(\"AGENT_DIFF_BASE_URL\"):\n",
- " os.environ[\"AGENT_DIFF_BASE_URL\"] = \"https://api.agentdiff.dev\"\n",
- "\n",
- "OPENROUTER_API_KEY = os.environ.get(\"OPENROUTER_API_KEY\") or getpass(\"OpenRouter API key: \")\n",
- "\n",
- "# --- Settings ---\n",
- "MODEL = \"deepseek/deepseek-chat-v3-0324\" # change to any OpenRouter model\n",
- "MAX_ITERATIONS = 40 # max agent loop turns per task\n",
- "MAX_TESTS = None # None = run all tests; set to e.g. 5 for a quick trial\n",
- "TIMEOUT_SECONDS = 480 # per-test timeout"
+ "# Get your API key at https://www.agentdiff.dev/dashboard\n",
+ "%env AGENT_DIFF_API_KEY=\n",
+ "%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev\n",
+ "# OpenRouter key (or any OpenAI-compatible provider) e.g. https://openrouter.ai/anthropic/claude-haiku-4.5\n",
+ "%env OPENAI_API_KEY="
]
},
{
@@ -56,104 +47,32 @@
"metadata": {},
"outputs": [],
"source": [
- "SERVICE_CONFIG = {\n",
- " \"slack\": {\n",
- " \"name\": \"Slack\",\n",
- " \"base_url\": \"https://slack.com/api\",\n",
- " \"description\": \"Slack workspace messaging and collaboration API\",\n",
- " \"extra_context\": \"\",\n",
- " \"test_suite_name\": \"Slack Bench v2\",\n",
- " },\n",
- " \"box\": {\n",
- " \"name\": \"Box\",\n",
- " \"base_url\": \"https://api.box.com/2.0\",\n",
- " \"description\": \"Box cloud storage and file management API\",\n",
- " \"extra_context\": \"\",\n",
- " \"test_suite_name\": \"Box Bench v2\",\n",
- " },\n",
- " \"calendar\": {\n",
- " \"name\": \"Google Calendar\",\n",
- " \"base_url\": \"https://www.googleapis.com/calendar/v3\",\n",
- " \"description\": \"Google Calendar scheduling and events API\",\n",
- " \"extra_context\": \"Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles. Use this as the reference point for all relative date/time expressions like 'today', 'tomorrow', 'this Saturday', etc.\",\n",
- " \"test_suite_name\": \"Calendar Bench\",\n",
- " },\n",
- " \"linear\": {\n",
- " \"name\": \"Linear\",\n",
- " \"base_url\": \"https://api.linear.app/graphql\",\n",
- " \"description\": \"Linear project management and issue tracking API\",\n",
- " \"extra_context\": \"\",\n",
- " \"test_suite_name\": \"Linear Bench\",\n",
- " },\n",
- "}\n",
- "\n",
- "SYSTEM_PROMPT_TEMPLATE = \"\"\"You are an AI assistant that completes tasks by interacting with APIs via bash commands.\n",
- "\n",
- "Current Session:\n",
- "- Service: {service_name}\n",
- "- Base URL: {base_url}\n",
- "- Description: {service_description}\n",
- "{extra_context}\n",
+ "import time\n",
+ "import json\n",
+ "from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool\n",
+ "from langchain.agents import create_agent\n",
+ "from langchain_openai import ChatOpenAI\n",
"\n",
- "Environment:\n",
- "- You are authenticated as a user in the {service_name} workspace/account.\n",
- "- Authentication is handled automatically via proxy. Use placeholder tokens like where credentials would go.\n",
- "- Use the execute_bash tool to run bash commands (primarily curl) to interact with the {service_name} API.\n",
- "- If you are not sure how to use the {service_name} API, explore the endpoint, parameters, and learn how it works.\n",
- "- Parse API responses carefully - extract IDs and data needed for subsequent calls.\n",
- "- If a command fails, analyze the error and try a different approach.\n",
- "- Only declare completion when the task is fully completed (not just when you've gathered information).\n",
- "\"\"\"\n",
+ "client = AgentDiff()\n",
"\n",
+ "model = ChatOpenAI(\n",
+ " model=\"anthropic/claude-haiku-4.5\",\n",
+ " base_url=\"https://openrouter.ai/api/v1\",\n",
+ ")\n",
"\n",
- "def build_system_prompt(service: str) -> str:\n",
- " config = SERVICE_CONFIG[service]\n",
- " return SYSTEM_PROMPT_TEMPLATE.format(\n",
- " service_name=config[\"name\"],\n",
- " base_url=config[\"base_url\"],\n",
- " service_description=config[\"description\"],\n",
- " extra_context=config[\"extra_context\"],\n",
- " )"
+ "SERVICE_PROMPTS = {\n",
+ " \"slack\": \"Use execute_python to interact with Slack API at https://slack.com/api. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
+ " \"box\": \"Use execute_python to interact with Box API at https://api.box.com/2.0. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
+ " \"calendar\": \"Use execute_python to interact with Google Calendar API at https://www.googleapis.com/calendar/v3. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token. Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\",\n",
+ " \"linear\": \"Use execute_python to interact with Linear GraphQL API at https://api.linear.app/graphql. Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token.\",\n",
+ "}"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "import time\n",
- "from langchain_openai import ChatOpenAI\n",
- "from langchain.agents import AgentExecutor, create_tool_calling_agent\n",
- "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
- "from agent_diff import AgentDiff, BashExecutorProxy, create_langchain_tool\n",
- "\n",
- "\n",
- "def create_agent(service: str, bash_executor: BashExecutorProxy, model: str) -> AgentExecutor:\n",
- " \"\"\"Create a LangChain agent with the bash tool for a given service.\"\"\"\n",
- " llm = ChatOpenAI(\n",
- " base_url=\"https://openrouter.ai/api/v1\",\n",
- " api_key=OPENROUTER_API_KEY,\n",
- " model=model,\n",
- " temperature=0,\n",
- " )\n",
- " tool = create_langchain_tool(bash_executor)\n",
- " system_prompt = build_system_prompt(service)\n",
- "\n",
- " prompt = ChatPromptTemplate.from_messages([\n",
- " (\"system\", system_prompt),\n",
- " (\"human\", \"{input}\"),\n",
- " MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
- " ])\n",
- "\n",
- " agent = create_tool_calling_agent(llm, [tool], prompt)\n",
- " return AgentExecutor(\n",
- " agent=agent,\n",
- " tools=[tool],\n",
- " max_iterations=MAX_ITERATIONS,\n",
- " handle_parsing_errors=True,\n",
- " verbose=False,\n",
- " )"
+ "## Option A: Load from HuggingFace Dataset"
]
},
{
@@ -162,89 +81,69 @@
"metadata": {},
"outputs": [],
"source": [
- "from tqdm.auto import tqdm\n",
+ "from datasets import load_dataset\n",
"\n",
+ "dataset = load_dataset(\"hubertmarek/agent-diff-bench\", split=\"test\")\n",
+ "results = []\n",
"\n",
- "def run_single_test(client: AgentDiff, model: str, test, service: str) -> dict:\n",
- " \"\"\"Run one test: init env -> LangChain agent -> evaluate -> cleanup.\"\"\"\n",
- " env = None\n",
- " try:\n",
- " env = client.init_env(testId=test.id)\n",
- " run = client.start_run(envId=env.environmentId, testId=test.id)\n",
- " bash_executor = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
- "\n",
- " agent_executor = create_agent(service, bash_executor, model)\n",
- "\n",
- " start = time.perf_counter()\n",
- " agent_output = agent_executor.invoke({\"input\": test.prompt})\n",
- " elapsed = time.perf_counter() - start\n",
- "\n",
- " client.evaluate_run(runId=run.runId)\n",
- " result = client.get_results_for_run(runId=run.runId)\n",
- " client.delete_env(envId=env.environmentId)\n",
+ "for example in dataset.select(range(5)): # First 5 tasks; remove .select() for full benchmark\n",
+ " info = json.loads(example[\"info\"]) if isinstance(example[\"info\"], str) else example[\"info\"]\n",
+ " expected = json.loads(example[\"answer\"]) if isinstance(example[\"answer\"], str) else example[\"answer\"]\n",
+ " service = info[\"service\"]\n",
"\n",
- " return {\n",
- " \"test_id\": str(test.id),\n",
- " \"test_name\": getattr(test, \"name\", \"\"),\n",
- " \"passed\": result.passed,\n",
- " \"score\": result.score.get(\"percent\", 0) if isinstance(result.score, dict) else 0,\n",
- " \"failures\": result.failures,\n",
- " \"time\": round(elapsed, 2),\n",
- " \"agent_output\": agent_output.get(\"output\", \"\"),\n",
- " }\n",
- " except Exception as e:\n",
- " if env:\n",
- " try:\n",
- " client.delete_env(envId=env.environmentId)\n",
- " except Exception:\n",
- " pass\n",
- " return {\"test_id\": str(test.id), \"test_name\": getattr(test, \"name\", \"\"), \"passed\": False, \"score\": 0, \"error\": str(e)}\n",
+ " print(f\"Running: {example.get('test_name', example['test_id'])}\")\n",
"\n",
+ " env = client.init_env(\n",
+ " templateService=info[\"service\"],\n",
+ " templateName=info[\"seed_template\"],\n",
+ " impersonateUserId=info[\"impersonate_user_id\"],\n",
+ " )\n",
+ " run = client.start_run(envId=env.environmentId)\n",
"\n",
- "def run_benchmark(model: str, services: list[str] | None = None, max_tests: int | None = None) -> list[dict]:\n",
- " \"\"\"Run the full benchmark across services using LangChain agent.\"\"\"\n",
- " services = services or list(SERVICE_CONFIG.keys())\n",
- " client = AgentDiff()\n",
- " all_results = []\n",
+ " python_tool = create_langchain_tool(\n",
+ " PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
+ " )\n",
"\n",
- " for service in services:\n",
- " config = SERVICE_CONFIG[service]\n",
+ " agent = create_agent(\n",
+ " model=model,\n",
+ " tools=[python_tool],\n",
+ " system_prompt=SERVICE_PROMPTS[service],\n",
+ " )\n",
"\n",
- " suite_list = client.list_test_suites(name=config[\"test_suite_name\"])\n",
- " if not suite_list.testSuites:\n",
- " print(f\"[SKIP] Test suite '{config['test_suite_name']}' not found.\")\n",
- " continue\n",
- " suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n",
- " tests = suite.tests[:max_tests] if max_tests else suite.tests\n",
+ " start = time.perf_counter()\n",
+ " try:\n",
+ " response = agent.invoke({\"messages\": [\n",
+ " {\"role\": \"user\", \"content\": example[\"question\"]}\n",
+ " ]})\n",
+ " except Exception as e:\n",
+ " response = {\"error\": str(e)}\n",
+ " elapsed = time.perf_counter() - start\n",
"\n",
- " print(f\"\\n{'='*60}\")\n",
- " print(f\" {config['name']} — {len(tests)} tests | model: {model}\")\n",
- " print(f\"{'='*60}\")\n",
+ " client.evaluate_run(runId=run.runId, expectedOutput=expected)\n",
+ " result = client.get_results_for_run(runId=run.runId)\n",
"\n",
- " for test in tqdm(tests, desc=config[\"name\"]):\n",
- " result = run_single_test(client, model, test, service)\n",
- " result[\"service\"] = service\n",
- " result[\"model\"] = model\n",
- " all_results.append(result)\n",
+ " results.append({\n",
+ " \"test_id\": example[\"test_id\"],\n",
+ " \"service\": service,\n",
+ " \"passed\": result.passed,\n",
+ " \"score\": result.score,\n",
+ " \"time\": round(elapsed, 1),\n",
+ " })\n",
+ " print(f\" {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s\")\n",
"\n",
- " status = \"PASS\" if result.get(\"passed\") else \"FAIL\"\n",
- " score = result.get(\"score\", 0)\n",
- " tqdm.write(f\" [{status}] {result.get('test_name', result['test_id'])[:60]} score={score}\")\n",
+ " client.delete_env(envId=env.environmentId)\n",
"\n",
- " return all_results"
+ "passed = sum(1 for r in results if r[\"passed\"])\n",
+ "print(f\"\\nResults: {passed}/{len(results)} passed\")"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "results = run_benchmark(\n",
- " model=MODEL,\n",
- " services=None, # all 4 services; or e.g. [\"slack\", \"box\"]\n",
- " max_tests=MAX_TESTS,\n",
- ")"
+ "## Option B: Load from Server Test Suites\n",
+ "\n",
+ "Uses the Agent-Diff platform's test suite API. Assertions are defined server-side so you don't need to pass `expectedOutput` — just call `evaluate_run`. Available test suites: [docs](https://agentdiff.mintlify.app/test-suites/benchmarks)."
]
},
{
@@ -253,32 +152,63 @@
"metadata": {},
"outputs": [],
"source": [
- "import pandas as pd\n",
+ "SUITES = [\"Slack Bench v2\", \"Box Bench v2\", \"Calendar Bench\", \"Linear Bench\"]\n",
"\n",
- "df = pd.DataFrame(results)\n",
+ "results = []\n",
"\n",
- "print(\"\\n\" + \"=\" * 60)\n",
- "print(f\" Results: {MODEL} (LangChain Agent)\")\n",
- "print(\"=\" * 60)\n",
+ "for suite_name in SUITES:\n",
+ " suite_list = client.list_test_suites(name=suite_name)\n",
+ " if not suite_list.testSuites:\n",
+ " print(f\"[SKIP] '{suite_name}' not found\")\n",
+ " continue\n",
+ " suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n",
+ " tests = suite.tests[:5] # First 5 tests per suite; remove [:5] for full benchmark\n",
"\n",
- "if \"service\" in df.columns and \"score\" in df.columns:\n",
- " summary = df.groupby(\"service\").agg(\n",
- " tests=(\"score\", \"count\"),\n",
- " passed=(\"passed\", \"sum\"),\n",
- " mean_score=(\"score\", \"mean\"),\n",
- " pass_rate=(\"passed\", \"mean\"),\n",
- " ).round(2)\n",
- " summary[\"pass_rate\"] = (summary[\"pass_rate\"] * 100).round(1)\n",
- " print(\"\\nPer-service summary:\")\n",
- " print(summary.to_string())\n",
+ " print(f\"\\n{'='*50}\")\n",
+ " print(f\" {suite_name} — {len(tests)} tests\")\n",
+ " print(f\"{'='*50}\")\n",
"\n",
- " overall_score = df[\"score\"].mean()\n",
- " overall_pass = df[\"passed\"].mean() * 100\n",
- " print(f\"\\nOverall: score={overall_score:.1f} pass_rate={overall_pass:.1f}%\")\n",
+ " for test in tests:\n",
+ " env = client.init_env(testId=test.id)\n",
+ " run = client.start_run(envId=env.environmentId, testId=test.id)\n",
+ "\n",
+ " python_tool = create_langchain_tool(\n",
+ " PythonExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
+ " )\n",
+ "\n",
+ " service = env.service\n",
+ " agent = create_agent(\n",
+ " model=model,\n",
+ " tools=[python_tool],\n",
+ " system_prompt=SERVICE_PROMPTS.get(service, SERVICE_PROMPTS[\"slack\"]),\n",
+ " )\n",
+ "\n",
+ " start = time.perf_counter()\n",
+ " try:\n",
+ " response = agent.invoke({\"messages\": [\n",
+ " {\"role\": \"user\", \"content\": test.prompt}\n",
+ " ]})\n",
+ " except Exception as e:\n",
+ " response = {\"error\": str(e)}\n",
+ " elapsed = time.perf_counter() - start\n",
+ "\n",
+ " client.evaluate_run(runId=run.runId)\n",
+ " result = client.get_results_for_run(runId=run.runId)\n",
+ "\n",
+ " results.append({\n",
+ " \"test_id\": str(test.id),\n",
+ " \"suite\": suite_name,\n",
+ " \"passed\": result.passed,\n",
+ " \"score\": result.score,\n",
+ " \"time\": round(elapsed, 1),\n",
+ " })\n",
+ " status = \"PASS\" if result.passed else \"FAIL\"\n",
+ " print(f\" [{status}] {getattr(test, 'name', str(test.id))[:60]} score={result.score} | {elapsed:.1f}s\")\n",
+ "\n",
+ " client.delete_env(envId=env.environmentId)\n",
"\n",
- " summary[\"mean_score\"].plot.bar(title=f\"Agent-Diff Score by Service ({MODEL}, LangChain)\", ylabel=\"Score\", xlabel=\"Service\", rot=0)\n",
- "else:\n",
- " print(df)"
+ "passed = sum(1 for r in results if r[\"passed\"])\n",
+ "print(f\"\\nResults: {passed}/{len(results)} passed\")"
]
}
],
diff --git a/examples/react_agent_benchmark.ipynb b/examples/react_agent_benchmark.ipynb
index e6c0d6d..14d2656 100644
--- a/examples/react_agent_benchmark.ipynb
+++ b/examples/react_agent_benchmark.ipynb
@@ -10,8 +10,6 @@
"\n",
"The agent reasons step-by-step (``), executes bash/curl commands (``), observes the result, and repeats until the task is done (``).\n",
"\n",
- "All 4 services (Box, Calendar, Linear, Slack) are evaluated across 224 tasks.\n",
- "\n",
"[](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/react_agent_benchmark.ipynb)\n",
"\n",
"**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff)"
@@ -23,7 +21,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip install agent-diff httpx tqdm pandas -q"
+ "!pip install agent-diff httpx datasets -q"
]
},
{
@@ -32,22 +30,10 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "from getpass import getpass\n",
- "\n",
- "if not os.environ.get(\"AGENT_DIFF_API_KEY\"):\n",
- " os.environ[\"AGENT_DIFF_API_KEY\"] = getpass(\"Agent-Diff API key: \")\n",
- "\n",
- "if not os.environ.get(\"AGENT_DIFF_BASE_URL\"):\n",
- " os.environ[\"AGENT_DIFF_BASE_URL\"] = \"https://api.agentdiff.dev\"\n",
- "\n",
- "OPENROUTER_API_KEY = os.environ.get(\"OPENROUTER_API_KEY\") or getpass(\"OpenRouter API key: \")\n",
- "\n",
- "# --- Settings ---\n",
- "MODEL = \"deepseek/deepseek-chat-v3-0324\" # change to any OpenRouter model\n",
- "MAX_ITERATIONS = 40 # max ReAct loop turns per task\n",
- "MAX_TESTS = None # None = run all tests; set to e.g. 5 for a quick trial\n",
- "TIMEOUT_SECONDS = 480 # per-test timeout"
+ "%env AGENT_DIFF_API_KEY=\n",
+ "%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev\n",
+ "# OpenRouter key (or any OpenAI-compatible provider) e.g. https://openrouter.ai/deepseek/deepseek-chat-v3-0324\n",
+ "%env OPENROUTER_API_KEY="
]
},
{
@@ -56,38 +42,24 @@
"metadata": {},
"outputs": [],
"source": [
+ "import os\n",
"import re\n",
- "from typing import Optional, Tuple\n",
+ "import time\n",
+ "import json\n",
+ "import httpx\n",
+ "import random\n",
+ "from datasets import load_dataset\n",
+ "from agent_diff import AgentDiff, BashExecutorProxy\n",
+ "\n",
+ "OPENROUTER_API_KEY = os.environ[\"OPENROUTER_API_KEY\"]\n",
+ "MODEL = \"deepseek/deepseek-chat-v3-0324\"\n",
"\n",
"SERVICE_CONFIG = {\n",
- " \"slack\": {\n",
- " \"name\": \"Slack\",\n",
- " \"base_url\": \"https://slack.com/api\",\n",
- " \"description\": \"Slack workspace messaging and collaboration API\",\n",
- " \"extra_context\": \"\",\n",
- " \"test_suite_name\": \"Slack Bench v2\",\n",
- " },\n",
- " \"box\": {\n",
- " \"name\": \"Box\",\n",
- " \"base_url\": \"https://api.box.com/2.0\",\n",
- " \"description\": \"Box cloud storage and file management API\",\n",
- " \"extra_context\": \"\",\n",
- " \"test_suite_name\": \"Box Bench v2\",\n",
- " },\n",
- " \"calendar\": {\n",
- " \"name\": \"Google Calendar\",\n",
- " \"base_url\": \"https://www.googleapis.com/calendar/v3\",\n",
- " \"description\": \"Google Calendar scheduling and events API\",\n",
- " \"extra_context\": \"- **Current Date/Time**: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles. Use this as the reference point for all relative date/time expressions like 'today', 'tomorrow', 'this Saturday', etc.\",\n",
- " \"test_suite_name\": \"Calendar Bench\",\n",
- " },\n",
- " \"linear\": {\n",
- " \"name\": \"Linear\",\n",
- " \"base_url\": \"https://api.linear.app/graphql\",\n",
- " \"description\": \"Linear project management and issue tracking API\",\n",
- " \"extra_context\": \"\",\n",
- " \"test_suite_name\": \"Linear Bench\",\n",
- " },\n",
+ " \"slack\": {\"name\": \"Slack\", \"base_url\": \"https://slack.com/api\", \"extra\": \"\"},\n",
+ " \"box\": {\"name\": \"Box\", \"base_url\": \"https://api.box.com/2.0\", \"extra\": \"\"},\n",
+ " \"calendar\": {\"name\": \"Google Calendar\", \"base_url\": \"https://www.googleapis.com/calendar/v3\",\n",
+ " \"extra\": \"- **Current Date/Time**: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\\n\"},\n",
+ " \"linear\": {\"name\": \"Linear\", \"base_url\": \"https://api.linear.app/graphql\", \"extra\": \"\"},\n",
"}\n",
"\n",
"REACT_SYSTEM_PROMPT = \"\"\"You are an AI assistant that completes tasks by interacting with APIs via bash commands.\n",
@@ -95,159 +67,80 @@
"## Current Session\n",
"- **Service**: {service_name}\n",
"- **Base URL**: {base_url}\n",
- "- **Description**: {service_description}\n",
"{extra_context}\n",
"\n",
"## Environment\n",
- "- You are authenticated as a user in the {service_name} workspace/account.\n",
- "- Authentication is handled automatically via proxy. Use placeholder tokens like `` where credentials would go.\n",
- "- You execute bash commands (primarily curl) to interact with the {service_name} API.\n",
- "- If you are not sure how to use {service_name} API, explore the endpoint, parameters, and learn how it works.\n",
- "- The environment is stateless between commands - you cannot install packages or persist files.\n",
+ "- Authentication is handled automatically via proxy. Use placeholder tokens where credentials would go.\n",
+ "- You execute bash commands (primarily curl) to interact with the API.\n",
+ "- If you are not sure how to use the API, explore the endpoint, parameters, and learn how it works.\n",
"\n",
"## Response Format\n",
- "You must respond using XML tags. Think step-by-step, then execute a command OR declare completion.\n",
- "\n",
- "**To execute a bash command:**\n",
- "\n",
- "Your reasoning about what needs to be done and why this command will help.\n",
- "\n",
- "\n",
- "\n",
- "Your bash command here (e.g., curl request)\n",
- "\n",
+ "Respond using XML tags:\n",
"\n",
- "**When the task is complete:**\n",
- "\n",
- "Your reasoning confirming the task is done based on API responses.\n",
- "\n",
+ "Your reasoning\n",
+ "Your bash command\n",
"\n",
- "\n",
- "Brief summary of what was accomplished.\n",
- "\n",
+ "When done:\n",
+ "Your reasoning\n",
+ "Brief summary\n",
"\n",
"## Rules\n",
"1. Execute ONE command at a time, then wait for the result.\n",
"2. Parse API responses carefully - extract IDs and data needed for subsequent calls.\n",
"3. If a command fails, analyze the error and try a different approach.\n",
- "4. Only use when the task is fully completed (not just when you've gathered information).\n",
+ "4. Only use when the task is fully completed.\n",
"\"\"\"\n",
"\n",
"\n",
- "def build_system_prompt(service: str) -> str:\n",
- " config = SERVICE_CONFIG[service]\n",
- " return REACT_SYSTEM_PROMPT.format(\n",
- " service_name=config[\"name\"],\n",
- " base_url=config[\"base_url\"],\n",
- " service_description=config[\"description\"],\n",
- " extra_context=config[\"extra_context\"],\n",
- " )\n",
- "\n",
- "\n",
- "def parse_react_response(response: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:\n",
- " \"\"\"Parse ReAct XML response. Returns (thinking, action, done).\"\"\"\n",
- " thinking_match = re.search(r'(.*?)', response, re.DOTALL)\n",
- " action_match = re.search(r'(.*?)', response, re.DOTALL)\n",
- " done_match = re.search(r'(.*?)', response, re.DOTALL)\n",
- " thinking = thinking_match.group(1).strip() if thinking_match else None\n",
- " action = action_match.group(1).strip() if action_match else None\n",
- " done = done_match.group(1).strip() if done_match else None\n",
- " return thinking, action, done"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "import httpx\n",
- "from agent_diff import AgentDiff, BashExecutorProxy\n",
- "\n",
- "\n",
- "def call_openrouter(model: str, messages: list, max_retries: int = 3) -> dict:\n",
- " \"\"\"Call OpenRouter chat completions API with retry logic.\"\"\"\n",
- " import random\n",
- " last_error = None\n",
+ "def call_openrouter(model, messages, max_retries=3):\n",
" for attempt in range(max_retries):\n",
" try:\n",
" with httpx.Client(timeout=120) as http:\n",
" resp = http.post(\n",
" \"https://openrouter.ai/api/v1/chat/completions\",\n",
- " headers={\"Authorization\": f\"Bearer {OPENROUTER_API_KEY}\", \"Content-Type\": \"application/json\"},\n",
+ " headers={\"Authorization\": f\"Bearer {OPENROUTER_API_KEY}\"},\n",
" json={\"model\": model, \"messages\": messages},\n",
" )\n",
" resp.raise_for_status()\n",
- " data = resp.json()\n",
- " choice = data[\"choices\"][0]\n",
- " usage = data.get(\"usage\", {})\n",
- " return {\n",
- " \"content\": choice[\"message\"][\"content\"],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": usage.get(\"prompt_tokens\", 0),\n",
- " \"completion_tokens\": usage.get(\"completion_tokens\", 0),\n",
- " \"total_tokens\": usage.get(\"total_tokens\", 0),\n",
- " \"cost\": usage.get(\"cost\", 0.0),\n",
- " },\n",
- " }\n",
- " except (httpx.HTTPStatusError, httpx.ConnectError, httpx.ReadError) as e:\n",
- " last_error = e\n",
- " should_retry = not isinstance(e, httpx.HTTPStatusError) or e.response.status_code in (429, 500, 502, 503, 504)\n",
- " if should_retry and attempt < max_retries - 1:\n",
- " delay = 2 * (2 ** attempt) + random.uniform(0, 1)\n",
- " print(f\" [RETRY] attempt {attempt+1}: {e}. Waiting {delay:.1f}s...\")\n",
- " time.sleep(delay)\n",
+ " return resp.json()[\"choices\"][0][\"message\"][\"content\"]\n",
+ " except (httpx.HTTPStatusError, httpx.ConnectError) as e:\n",
+ " if attempt < max_retries - 1:\n",
+ " time.sleep(2 * (2 ** attempt) + random.uniform(0, 1))\n",
" continue\n",
" raise\n",
- " raise last_error\n",
- "\n",
"\n",
- "def run_react_agent(model: str, task_prompt: str, bash_executor: BashExecutorProxy, system_prompt: str, max_iterations: int = 40) -> dict:\n",
- " \"\"\"Run the ReAct agent loop: think -> act -> observe -> repeat.\"\"\"\n",
- " messages = [\n",
- " {\"role\": \"system\", \"content\": system_prompt},\n",
- " {\"role\": \"user\", \"content\": f\"Task: {task_prompt}\"},\n",
- " ]\n",
- " steps = []\n",
- " total_usage = {\"prompt_tokens\": 0, \"completion_tokens\": 0, \"total_tokens\": 0, \"cost\": 0.0}\n",
"\n",
- " for iteration in range(max_iterations):\n",
- " try:\n",
- " api_resp = call_openrouter(model, messages)\n",
- " except Exception as e:\n",
- " steps.append({\"iteration\": iteration + 1, \"error\": str(e)})\n",
- " break\n",
- "\n",
- " response_text = api_resp[\"content\"]\n",
- " for k in total_usage:\n",
- " total_usage[k] += api_resp[\"usage\"].get(k, 0)\n",
+ "def parse_react(response):\n",
+ " thinking = re.search(r'(.*?)', response, re.DOTALL)\n",
+ " action = re.search(r'(.*?)', response, re.DOTALL)\n",
+ " done = re.search(r'(.*?)', response, re.DOTALL)\n",
+ " return (\n",
+ " thinking.group(1).strip() if thinking else None,\n",
+ " action.group(1).strip() if action else None,\n",
+ " done.group(1).strip() if done else None,\n",
+ " )\n",
"\n",
- " thinking, action, done = parse_react_response(response_text)\n",
"\n",
+ "def run_react_agent(model, prompt, bash, system_prompt, max_iterations=40):\n",
+ " messages = [{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": prompt}]\n",
+ " for i in range(max_iterations):\n",
+ " text = call_openrouter(model, messages)\n",
+ " _, action, done = parse_react(text)\n",
" if action:\n",
- " try:\n",
- " result = bash_executor.execute(action)\n",
- " observation = {\"stdout\": result.get(\"stdout\", \"\"), \"stderr\": result.get(\"stderr\", \"\"), \"exit_code\": result.get(\"exit_code\", 0)} if isinstance(result, dict) else {\"stdout\": str(result), \"stderr\": \"\", \"exit_code\": 0}\n",
- " except Exception as e:\n",
- " observation = {\"stdout\": \"\", \"stderr\": str(e), \"exit_code\": 1}\n",
- "\n",
- " steps.append({\"iteration\": iteration + 1, \"thinking\": thinking, \"action\": action, \"observation\": observation})\n",
- "\n",
- " obs_text = observation[\"stdout\"].strip() or \"(empty output)\"\n",
- " if observation.get(\"exit_code\", 0) != 0:\n",
- " obs_text = f\"{observation['stdout']}\\n[stderr]: {observation['stderr']}\\n[exit_code]: {observation['exit_code']}\".strip()\n",
- "\n",
- " messages.append({\"role\": \"assistant\", \"content\": response_text})\n",
- " messages.append({\"role\": \"user\", \"content\": f\"\\n{obs_text}\\n\"})\n",
- "\n",
+ " result = bash.execute(action)\n",
+ " stdout = result.get(\"stdout\", \"\") if isinstance(result, dict) else str(result)\n",
+ " stderr = result.get(\"stderr\", \"\") if isinstance(result, dict) else \"\"\n",
+ " obs = stdout.strip() or \"(empty output)\"\n",
+ " if result.get(\"exit_code\", 0) != 0:\n",
+ " obs = f\"{stdout}\\n[stderr]: {stderr}\".strip()\n",
+ " messages.append({\"role\": \"assistant\", \"content\": text})\n",
+ " messages.append({\"role\": \"user\", \"content\": f\"\\n{obs}\\n\"})\n",
" elif done:\n",
- " return {\"steps\": steps, \"completed\": True, \"iterations\": iteration + 1, \"summary\": done, \"usage\": total_usage}\n",
+ " return {\"completed\": True, \"iterations\": i + 1}\n",
" else:\n",
- " messages.append({\"role\": \"assistant\", \"content\": response_text})\n",
- " messages.append({\"role\": \"user\", \"content\": \"Please respond with either an to execute or if the task is complete.\"})\n",
- "\n",
- " return {\"steps\": steps, \"completed\": False, \"iterations\": max_iterations, \"summary\": None, \"usage\": total_usage}"
+ " messages.append({\"role\": \"assistant\", \"content\": text})\n",
+ " messages.append({\"role\": \"user\", \"content\": \"Please respond with or .\"})\n",
+ " return {\"completed\": False, \"iterations\": max_iterations}"
]
},
{
@@ -256,127 +149,54 @@
"metadata": {},
"outputs": [],
"source": [
- "from tqdm.auto import tqdm\n",
+ "client = AgentDiff()\n",
+ "dataset = load_dataset(\"hubertmarek/agent-diff-bench\", split=\"test\")\n",
"\n",
+ "results = []\n",
"\n",
- "def run_single_test(client: AgentDiff, model: str, test, system_prompt: str, max_iterations: int, timeout: int) -> dict:\n",
- " \"\"\"Run one test: init env -> agent loop -> evaluate -> cleanup.\"\"\"\n",
- " env = None\n",
- " try:\n",
- " env = client.init_env(testId=test.id)\n",
- " run = client.start_run(envId=env.environmentId, testId=test.id)\n",
- " bash_executor = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
- "\n",
- " start = time.perf_counter()\n",
- " trace = run_react_agent(model, test.prompt, bash_executor, system_prompt, max_iterations)\n",
- " elapsed = time.perf_counter() - start\n",
- "\n",
- " client.evaluate_run(runId=run.runId)\n",
- " result = client.get_results_for_run(runId=run.runId)\n",
- " client.delete_env(envId=env.environmentId)\n",
- "\n",
- " return {\n",
- " \"test_id\": str(test.id),\n",
- " \"test_name\": getattr(test, \"name\", \"\"),\n",
- " \"passed\": result.passed,\n",
- " \"score\": result.score.get(\"percent\", 0) if isinstance(result.score, dict) else 0,\n",
- " \"failures\": result.failures,\n",
- " \"time\": round(elapsed, 2),\n",
- " \"iterations\": trace[\"iterations\"],\n",
- " \"completed\": trace[\"completed\"],\n",
- " \"usage\": trace[\"usage\"],\n",
- " }\n",
- " except Exception as e:\n",
- " if env:\n",
- " try:\n",
- " client.delete_env(envId=env.environmentId)\n",
- " except Exception:\n",
- " pass\n",
- " return {\"test_id\": str(test.id), \"test_name\": getattr(test, \"name\", \"\"), \"passed\": False, \"score\": 0, \"error\": str(e)}\n",
+ "for example in dataset.select(range(5)): # First 5 tasks; remove .select() for full benchmark\n",
+ " info = json.loads(example[\"info\"]) if isinstance(example[\"info\"], str) else example[\"info\"]\n",
+ " expected = json.loads(example[\"answer\"]) if isinstance(example[\"answer\"], str) else example[\"answer\"]\n",
+ " service = info[\"service\"]\n",
+ " cfg = SERVICE_CONFIG[service]\n",
"\n",
+ " system_prompt = REACT_SYSTEM_PROMPT.format(\n",
+ " service_name=cfg[\"name\"], base_url=cfg[\"base_url\"], extra_context=cfg[\"extra\"]\n",
+ " )\n",
"\n",
- "def run_benchmark(model: str, services: list[str] | None = None, max_tests: int | None = None, max_iterations: int = 40, timeout: int = 480) -> list[dict]:\n",
- " \"\"\"Run the full benchmark across services. Returns list of result dicts.\"\"\"\n",
- " services = services or list(SERVICE_CONFIG.keys())\n",
- " client = AgentDiff()\n",
- " all_results = []\n",
- "\n",
- " for service in services:\n",
- " config = SERVICE_CONFIG[service]\n",
- " system_prompt = build_system_prompt(service)\n",
- "\n",
- " suite_list = client.list_test_suites(name=config[\"test_suite_name\"])\n",
- " if not suite_list.testSuites:\n",
- " print(f\"[SKIP] Test suite '{config['test_suite_name']}' not found.\")\n",
- " continue\n",
- " suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)\n",
- " tests = suite.tests[:max_tests] if max_tests else suite.tests\n",
- "\n",
- " print(f\"\\n{'='*60}\")\n",
- " print(f\" {config['name']} — {len(tests)} tests | model: {model}\")\n",
- " print(f\"{'='*60}\")\n",
- "\n",
- " for test in tqdm(tests, desc=config[\"name\"]):\n",
- " result = run_single_test(client, model, test, system_prompt, max_iterations, timeout)\n",
- " result[\"service\"] = service\n",
- " result[\"model\"] = model\n",
- " all_results.append(result)\n",
- "\n",
- " status = \"PASS\" if result.get(\"passed\") else \"FAIL\"\n",
- " score = result.get(\"score\", 0)\n",
- " tqdm.write(f\" [{status}] {result.get('test_name', result['test_id'])[:60]} score={score}\")\n",
+ " print(f\"Running: {example.get('test_name', example['test_id'])}\")\n",
"\n",
- " return all_results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "results = run_benchmark(\n",
- " model=MODEL,\n",
- " services=None, # all 4 services; or e.g. [\"slack\", \"box\"]\n",
- " max_tests=MAX_TESTS,\n",
- " max_iterations=MAX_ITERATIONS,\n",
- " timeout=TIMEOUT_SECONDS,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
+ " env = client.init_env(\n",
+ " templateService=info[\"service\"],\n",
+ " templateName=info[\"seed_template\"],\n",
+ " impersonateUserId=info[\"impersonate_user_id\"],\n",
+ " )\n",
+ " run = client.start_run(envId=env.environmentId)\n",
+ " bash = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)\n",
"\n",
- "df = pd.DataFrame(results)\n",
+ " start = time.perf_counter()\n",
+ " try:\n",
+ " trace = run_react_agent(MODEL, example[\"question\"], bash, system_prompt)\n",
+ " except Exception as e:\n",
+ " trace = {\"completed\": False, \"error\": str(e)}\n",
+ " elapsed = time.perf_counter() - start\n",
"\n",
- "print(\"\\n\" + \"=\" * 60)\n",
- "print(f\" Results: {MODEL}\")\n",
- "print(\"=\" * 60)\n",
+ " client.evaluate_run(runId=run.runId, expectedOutput=expected)\n",
+ " result = client.get_results_for_run(runId=run.runId)\n",
"\n",
- "if \"service\" in df.columns and \"score\" in df.columns:\n",
- " summary = df.groupby(\"service\").agg(\n",
- " tests=(\"score\", \"count\"),\n",
- " passed=(\"passed\", \"sum\"),\n",
- " mean_score=(\"score\", \"mean\"),\n",
- " pass_rate=(\"passed\", \"mean\"),\n",
- " ).round(2)\n",
- " summary[\"pass_rate\"] = (summary[\"pass_rate\"] * 100).round(1)\n",
- " print(\"\\nPer-service summary:\")\n",
- " print(summary.to_string())\n",
+ " results.append({\n",
+ " \"test_id\": example[\"test_id\"],\n",
+ " \"service\": service,\n",
+ " \"passed\": result.passed,\n",
+ " \"score\": result.score,\n",
+ " \"time\": round(elapsed, 1),\n",
+ " })\n",
+ " print(f\" {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s\")\n",
"\n",
- " overall_score = df[\"score\"].mean()\n",
- " overall_pass = df[\"passed\"].mean() * 100\n",
- " total_cost = sum(r.get(\"usage\", {}).get(\"cost\", 0) for r in results)\n",
- " print(f\"\\nOverall: score={overall_score:.1f} pass_rate={overall_pass:.1f}% cost=${total_cost:.4f}\")\n",
+ " client.delete_env(envId=env.environmentId)\n",
"\n",
- " summary[\"mean_score\"].plot.bar(title=f\"Agent-Diff Score by Service ({MODEL})\", ylabel=\"Score\", xlabel=\"Service\", rot=0)\n",
- "else:\n",
- " print(df)"
+ "passed = sum(1 for r in results if r[\"passed\"])\n",
+ "print(f\"\\nResults: {passed}/{len(results)} passed\")"
]
}
],