diff --git a/DockerfileLocal b/DockerfileLocal
index f934d97498..c7a92be406 100644
--- a/DockerfileLocal
+++ b/DockerfileLocal
@@ -33,4 +33,4 @@ EXPOSE 22 80 9000-9009
RUN chmod +x /exe/initialize.sh /exe/run_A0.sh /exe/run_searxng.sh /exe/run_tunnel_api.sh
# initialize runtime and switch to supervisord
-CMD ["/exe/initialize.sh", "$BRANCH"]
+CMD ["/exe/initialize.sh", "$BRANCH"]
\ No newline at end of file
diff --git a/agent.py b/agent.py
index 594dc37bc5..4f1ea4863d 100644
--- a/agent.py
+++ b/agent.py
@@ -275,7 +275,6 @@ class AgentConfig:
chat_model: models.ModelConfig
utility_model: models.ModelConfig
embeddings_model: models.ModelConfig
- browser_model: models.ModelConfig
mcp_servers: str
profile: str = ""
memory_subdir: str = ""
@@ -287,7 +286,12 @@ class AgentConfig:
code_exec_ssh_user: str = "root"
code_exec_ssh_pass: str = ""
additional: Dict[str, Any] = field(default_factory=dict)
-
+ browser_control_headless: bool = False # Browser GUI enabled for interaction (uses VNC if available, otherwise X11 forwarding)
+ browser_control_cdp_url: str = "" # Chrome DevTools Protocol URL for native browser (e.g., "ws://host.docker.internal:9222/devtools/browser/..."), leave empty to use embedded browser with VNC
+ browser_control_start_url: str = "https://www.google.com"
+ browser_control_timeout: int = 5000 # milliseconds
+ # VNC is automatically enabled if available (configured in docker-compose.yml)
+ # Access browser control via noVNC when agent calls pause_for_user method
@dataclass
class UserMessage:
@@ -676,14 +680,6 @@ def get_utility_model(self):
**self.config.utility_model.build_kwargs(),
)
- def get_browser_model(self):
- return models.get_browser_model(
- self.config.browser_model.provider,
- self.config.browser_model.name,
- model_config=self.config.browser_model,
- **self.config.browser_model.build_kwargs(),
- )
-
def get_embedding_model(self):
return models.get_embedding_model(
self.config.embeddings_model.provider,
diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile
index 7e94ed80a5..79dfaab174 100644
--- a/docker/base/Dockerfile
+++ b/docker/base/Dockerfile
@@ -27,6 +27,12 @@ RUN bash /ins/install_base_packages4.sh
# install python after packages to ensure version overriding
RUN bash /ins/install_python.sh
+# install X11 support for browser display
+RUN bash /ins/install_x11_support.sh
+
+# install VNC server and noVNC for remote browser control
+RUN bash /ins/install_vnc.sh
+
# install searxng
RUN bash /ins/install_searxng.sh
diff --git a/docker/run/docker-compose.yml b/docker/run/docker-compose.yml
index cc48f3f1ba..a90a70f71d 100644
--- a/docker/run/docker-compose.yml
+++ b/docker/run/docker-compose.yml
@@ -1,8 +1,53 @@
services:
agent-zero:
container_name: agent-zero
- image: agent0ai/agent-zero:latest
+ # Use local development image (build with: docker build -f DockerfileLocal -t agent-zero-local --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) .)
+ image: agent-zero-local
+ # Use Docker Hub image for production deployments
+ # image: agent0ai/agent-zero:latest
volumes:
- - ./agent-zero:/a0
+ # Mount the actual project root (not the outdated copy in ./agent-zero)
+ # This allows live development - changes reflected immediately without rebuild
+ - ../..:/a0
+ # X11 socket for GUI display on macOS (auto-configured)
+ - /tmp/.X11-unix:/tmp/.X11-unix:rw
ports:
- - "50080:80"
\ No newline at end of file
+ - "55022:22"
+ - "50080:80"
+ - "56080:6080" # noVNC web client for browser control
+ - "50090:9000"
+ - "50091:9001"
+ - "50092:9002"
+ - "50093:9003"
+ - "50094:9004"
+ - "50095:9005"
+ - "50096:9006"
+ - "50097:9007"
+ - "50098:9008"
+ - "50099:9009"
+ environment:
+ # X11 display forwarding via TCP (Docker Desktop on macOS uses VM)
+ - DISPLAY=host.docker.internal:0
+ - XAUTHORITY=/tmp/.Xauthority
+ # VNC configuration for remote browser control
+ - VNC_DISPLAY=:99
+ - VNC_RESOLUTION=1920x1080x24
+ - VNC_PORT=5900
+ - NOVNC_PORT=6080
+ - NOVNC_EXTERNAL_PORT=56080 # External port mapping for noVNC access
+      - VNC_PASSWORD=${VNC_PASSWORD:-agent-zero}  # override from the host environment or .env; falls back to the default
+ # Allow container to reach host for X11
+ extra_hosts:
+ - "host.docker.internal:host-gateway"
+ # Security options for X11
+ security_opt:
+ - seccomp:unconfined
+ # Shared memory for browser (required for Chromium)
+ shm_size: '2gb'
+ # Auto-check and setup display and VNC on startup
+ command: >
+ bash -c "
+ /exe/check_display.sh || true &&
+ /exe/start_vnc.sh || true &&
+ /exe/initialize.sh development
+ "
\ No newline at end of file
diff --git a/docker/run/fs/etc/supervisor/conf.d/vnc.conf b/docker/run/fs/etc/supervisor/conf.d/vnc.conf
new file mode 100644
index 0000000000..e20409b2da
--- /dev/null
+++ b/docker/run/fs/etc/supervisor/conf.d/vnc.conf
@@ -0,0 +1,13 @@
+[program:run_vnc]
+command=/exe/start_vnc.sh
+startsecs=0 ; one-shot launcher: it backgrounds Xvfb/x11vnc/websockify and then exits
+user=root
+stopwaitsecs=10
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+autorestart=unexpected ; exit code 0 means the launcher succeeded; restart only on failure
+startretries=3
+stopasgroup=true
+killasgroup=true
diff --git a/docker/run/fs/exe/check_display.sh b/docker/run/fs/exe/check_display.sh
new file mode 100755
index 0000000000..d0b2e7813a
--- /dev/null
+++ b/docker/run/fs/exe/check_display.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Automatic X11 display setup checker
+# Runs on container startup to verify display forwarding
+# No user interaction required - fully automatic
+
+set -e
+
+echo "========================================"
+echo "Agent Zero - Display Setup Check"
+echo "========================================"
+
+# Detect X11 forwarding: a mounted X11 socket directory or the Docker Desktop host display
+IS_MACOS=false
+if [ -d /tmp/.X11-unix ] || [ "$DISPLAY" = "host.docker.internal:0" ]; then
+    IS_MACOS=true
+fi
+
+# Check if DISPLAY is set
+if [ -z "$DISPLAY" ]; then
+ echo "⚠️ No display configured (headless mode)"
+ echo " Browser will run in headless mode (invisible)"
+ echo ""
+ echo "To enable visible browser on macOS:"
+ echo " 1. Install XQuartz: https://www.xquartz.org/"
+ echo " 2. Start XQuartz and restart Agent Zero"
+ exit 0
+fi
+
+# Display is configured - verify X11 libraries
+echo "✓ Display configured: $DISPLAY"
+
+# Check if X11 libraries are installed
+if ! dpkg -l | grep -q libx11-6; then
+ echo "Installing X11 libraries for browser display..."
+ apt-get update -qq
+ DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
+ libx11-6 libxcb1 libxcomposite1 libxcursor1 libxdamage1 \
+ libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 \
+ libxtst6 libgbm1 libasound2 libatk1.0-0 libatk-bridge2.0-0 \
+ libcups2 libdrm2 libgtk-3-0 libnspr4 libnss3 \
+ 2>&1 | grep -v "^Reading" | grep -v "^Building" || true
+fi
+
+echo "✓ X11 libraries installed"
+
+# Test X11 connection
+if [ "$IS_MACOS" = true ]; then
+ echo "Testing X11 connection to macOS host..."
+
+ # Try to connect to X11
+ timeout 2 xdpyinfo -display "$DISPLAY" > /dev/null 2>&1 && {
+ echo "✓ X11 connection successful"
+ echo "✓ Browser will appear on your screen"
+ exit 0
+ } || {
+ echo ""
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo "⚠️ Cannot connect to X11 display"
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo ""
+ echo "To see the browser window, you need XQuartz:"
+ echo ""
+ echo " 1. Download and install XQuartz:"
+ echo " https://www.xquartz.org/"
+ echo ""
+ echo " 2. Log out and log back in (required!)"
+ echo ""
+ echo " 3. Allow Docker connections:"
+ echo " xhost +localhost"
+ echo ""
+ echo " 4. Restart Agent Zero:"
+ echo " cd docker/run && docker-compose restart"
+ echo ""
+ echo "For now, browser will run in headless mode."
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo ""
+ exit 0
+ }
+fi
+
+echo "✓ Display setup complete"
+echo "========================================"
diff --git a/docker/run/fs/exe/initialize.sh b/docker/run/fs/exe/initialize.sh
index 8c329bb304..ba6b5d9cc3 100644
--- a/docker/run/fs/exe/initialize.sh
+++ b/docker/run/fs/exe/initialize.sh
@@ -19,5 +19,8 @@ chmod 444 /root/.profile
# update package list to save time later
apt-get update > /dev/null 2>&1 &
+# Start VNC server in the background (for browser control feature)
+/exe/start_vnc.sh > /tmp/vnc_startup.log 2>&1 &
+
# let supervisord handle the services
exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf
diff --git a/docker/run/fs/exe/start_vnc.sh b/docker/run/fs/exe/start_vnc.sh
new file mode 100755
index 0000000000..906e5f43dc
--- /dev/null
+++ b/docker/run/fs/exe/start_vnc.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+# VNC Server Startup Script
+# Starts Xvfb, x11vnc, and noVNC for remote browser control
+# Can be safely run multiple times (idempotent)
+
+set -e
+
+echo "========================================"
+echo "Agent Zero - VNC Server Setup"
+echo "========================================"
+
+# Configuration from environment variables with defaults
+VNC_DISPLAY="${VNC_DISPLAY:-:99}"
+VNC_RESOLUTION="${VNC_RESOLUTION:-1920x1080x24}"
+VNC_PORT="${VNC_PORT:-5900}"
+NOVNC_PORT="${NOVNC_PORT:-6080}"
+VNC_PASSWORD="${VNC_PASSWORD:-agent-zero}"
+
+# Extract display number (e.g., :99 -> 99)
+DISPLAY_NUM=$(echo $VNC_DISPLAY | tr -d ':')
+
+echo "Configuration:"
+echo " Display: $VNC_DISPLAY"
+echo " Resolution: $VNC_RESOLUTION"
+echo " VNC Port: $VNC_PORT"
+echo " noVNC Port: $NOVNC_PORT"
+echo "========================================"
+
+# Function to check if a process is running
+is_running() {
+ pgrep -f "$1" > /dev/null 2>&1
+}
+
+# Function to kill existing VNC processes
+cleanup_vnc() {
+ echo "Cleaning up existing VNC processes..."
+ pkill -f "Xvfb $VNC_DISPLAY" || true
+ pkill -f "x11vnc.*$VNC_DISPLAY" || true
+ pkill -f "websockify.*$NOVNC_PORT" || true
+ # Remove stale lock file (socket file removal may fail, but that's OK)
+ rm -f /tmp/.X${DISPLAY_NUM}-lock 2>/dev/null || true
+ rm -f /tmp/.X11-unix/X${DISPLAY_NUM} 2>/dev/null || true
+ sleep 1
+}
+
+# Check if already running - if so, only refresh the status file and skip startup
+if is_running "Xvfb $VNC_DISPLAY" && is_running "x11vnc.*$VNC_DISPLAY" && is_running "websockify.*$NOVNC_PORT"; then
+ echo "✓ VNC server already running"
+ echo " - Xvfb on display $VNC_DISPLAY"
+ echo " - x11vnc on port $VNC_PORT"
+ echo " - noVNC web client on port $NOVNC_PORT"
+ echo "========================================"
+
+    # Nothing to start: record the already-running processes instead of relaunching them
+    # Find PIDs of running processes
+ XVFB_PID=$(pgrep -f "Xvfb $VNC_DISPLAY" | head -1)
+ X11VNC_PID=$(pgrep -f "x11vnc.*$VNC_DISPLAY" | head -1)
+ WEBSOCKIFY_PID=$(pgrep -f "websockify.*$NOVNC_PORT" | head -1)
+
+ # Create status file
+ mkdir -p /tmp/vnc
+ echo "DISPLAY=$VNC_DISPLAY" > /tmp/vnc/status
+ echo "VNC_PORT=$VNC_PORT" >> /tmp/vnc/status
+ echo "NOVNC_PORT=$NOVNC_PORT" >> /tmp/vnc/status
+ echo "XVFB_PID=$XVFB_PID" >> /tmp/vnc/status
+ echo "X11VNC_PID=$X11VNC_PID" >> /tmp/vnc/status
+ echo "WEBSOCKIFY_PID=$WEBSOCKIFY_PID" >> /tmp/vnc/status
+ echo "READY=true" >> /tmp/vnc/status
+
+    # This script has no monitoring loop; the flag below only makes the
+    # startup section further down a no-op before the final "exit 0"
+ SKIP_STARTUP=true
+else
+ SKIP_STARTUP=false
+fi
+
+# Only run startup if not skipping
+if [ "$SKIP_STARTUP" = "false" ]; then
+
+# Clean up any partial VNC processes
+cleanup_vnc
+
+# Create VNC password file
+mkdir -p /root/.vnc
+echo "Setting VNC password..."
+x11vnc -storepasswd "$VNC_PASSWORD" /root/.vnc/passwd 2>/dev/null || {
+ echo "⚠️ Failed to set VNC password, trying alternative method..."
+ # Alternative method using printf and stdin
+ printf "%s\n%s\n" "$VNC_PASSWORD" "$VNC_PASSWORD" | x11vnc -storepasswd /root/.vnc/passwd 2>/dev/null || {
+ echo "⚠️ Password setup failed, VNC may not be accessible"
+ }
+}
+
+# Start Xvfb (X virtual framebuffer)
+echo "Starting Xvfb on display $VNC_DISPLAY..."
+Xvfb $VNC_DISPLAY -screen 0 $VNC_RESOLUTION -ac +extension GLX +render -noreset > /tmp/xvfb.log 2>&1 &
+XVFB_PID=$!
+
+# Wait for Xvfb to be ready
+sleep 2
+
+if ! is_running "Xvfb $VNC_DISPLAY"; then
+ echo "❌ Failed to start Xvfb"
+ cat /tmp/xvfb.log
+ exit 1
+fi
+
+echo "✓ Xvfb started successfully (PID: $XVFB_PID)"
+
+# Start x11vnc (VNC server)
+echo "Starting x11vnc on port $VNC_PORT..."
+x11vnc \
+ -display $VNC_DISPLAY \
+ -rfbport $VNC_PORT \
+ -rfbauth /root/.vnc/passwd \
+ -forever \
+ -shared \
+ -noxdamage \
+ -ncache 10 \
+ -ncache_cr \
+ -localhost \
+ -quiet \
+ > /tmp/x11vnc.log 2>&1 &
+X11VNC_PID=$!
+
+# Wait for x11vnc to be ready
+sleep 2
+
+if ! is_running "x11vnc.*$VNC_DISPLAY"; then
+ echo "❌ Failed to start x11vnc"
+ cat /tmp/x11vnc.log
+ exit 1
+fi
+
+echo "✓ x11vnc started successfully (PID: $X11VNC_PID)"
+
+# Find noVNC installation
+NOVNC_PATH=""
+if [ -d "/opt/novnc" ]; then
+ NOVNC_PATH="/opt/novnc"
+elif [ -d "/usr/share/novnc" ]; then
+ NOVNC_PATH="/usr/share/novnc"
+elif [ -d "/usr/share/noVNC" ]; then
+ NOVNC_PATH="/usr/share/noVNC"
+fi
+
+if [ -z "$NOVNC_PATH" ]; then
+ echo "⚠️ noVNC not found, VNC server running but no web access"
+ echo " You can still connect with a VNC client on port $VNC_PORT"
+ echo "========================================"
+ exit 0
+fi
+
+# Start websockify for noVNC
+echo "Starting noVNC web client on port $NOVNC_PORT..."
+websockify \
+ --web=$NOVNC_PATH \
+ $NOVNC_PORT \
+ localhost:$VNC_PORT \
+ > /tmp/websockify.log 2>&1 &
+WEBSOCKIFY_PID=$!
+
+# Wait for websockify to be ready
+sleep 2
+
+if ! is_running "websockify.*$NOVNC_PORT"; then
+ echo "⚠️ Failed to start websockify/noVNC"
+ cat /tmp/websockify.log
+ echo " VNC server is running, but web access unavailable"
+else
+ echo "✓ noVNC started successfully (PID: $WEBSOCKIFY_PID)"
+ echo ""
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo "🎉 VNC Server Ready!"
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ echo ""
+ echo " Web Access: http://localhost:$NOVNC_PORT/vnc.html"
+ echo " VNC Client: localhost:$DISPLAY_NUM (port $VNC_PORT)"
+    echo "   Password:   (value of VNC_PASSWORD - not written to logs)"
+ echo ""
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+fi
+
+echo "========================================"
+
+# Create status file for other scripts to check (only if we started VNC)
+if [ "$SKIP_STARTUP" = "false" ]; then
+ mkdir -p /tmp/vnc
+ echo "DISPLAY=$VNC_DISPLAY" > /tmp/vnc/status
+ echo "VNC_PORT=$VNC_PORT" >> /tmp/vnc/status
+ echo "NOVNC_PORT=$NOVNC_PORT" >> /tmp/vnc/status
+ echo "XVFB_PID=$XVFB_PID" >> /tmp/vnc/status
+ echo "X11VNC_PID=$X11VNC_PID" >> /tmp/vnc/status
+ echo "WEBSOCKIFY_PID=$WEBSOCKIFY_PID" >> /tmp/vnc/status
+ echo "READY=true" >> /tmp/vnc/status
+fi
+
+# Close the startup section
+fi
+
+# VNC is now running in the background
+# Exit the script so initialize.sh can continue
+exit 0
diff --git a/docker/run/fs/ins/install_vnc.sh b/docker/run/fs/ins/install_vnc.sh
new file mode 100755
index 0000000000..fdf854f41a
--- /dev/null
+++ b/docker/run/fs/ins/install_vnc.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Install VNC server and noVNC for remote browser control
+# This allows users to manually interact with the browser when the agent pauses
+
+set -e
+
+echo "Installing VNC server and noVNC..."
+
+# Update package list
+apt-get update
+
+# Install Xvfb (X virtual framebuffer) for headless display
+# Install x11vnc for VNC server
+# Install websockify for WebSocket support (required by noVNC)
+# Install novnc for web-based VNC client
+DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ xvfb \
+ x11vnc \
+ websockify \
+ novnc \
+ net-tools \
+ procps
+
+# Create VNC directory for password and configuration
+mkdir -p /root/.vnc
+
+# Set default VNC password (will be overridden by environment variable)
+# Using x11vnc password format - pass password as argument
+x11vnc -storepasswd "agent-zero" /root/.vnc/passwd 2>/dev/null || true
+
+# Create symlink for noVNC to easily find it
+# noVNC is typically installed in /usr/share/novnc
+if [ -d "/usr/share/novnc" ]; then
+ ln -sf /usr/share/novnc /opt/novnc
+elif [ -d "/usr/share/noVNC" ]; then
+ ln -sf /usr/share/noVNC /opt/novnc
+fi
+
+# Clean up
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+
+echo "✓ VNC server and noVNC installed"
+echo " - Xvfb for virtual display"
+echo " - x11vnc for VNC server"
+echo " - noVNC for web-based access"
+echo " - Default VNC password: agent-zero (change via VNC_PASSWORD env var)"
diff --git a/docker/run/fs/ins/install_x11_support.sh b/docker/run/fs/ins/install_x11_support.sh
new file mode 100644
index 0000000000..6bbc4b5445
--- /dev/null
+++ b/docker/run/fs/ins/install_x11_support.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Install X11 and GUI support for browser display
+# This allows Chromium to display on the host machine via X11 forwarding
+
+set -e
+
+echo "Installing X11 and GUI support for browser display..."
+
+# Update package list
+apt-get update
+
+# Install X11 libraries and dependencies for GUI applications
+DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ libx11-6 \
+ libx11-xcb1 \
+ libxcb1 \
+ libxcomposite1 \
+ libxcursor1 \
+ libxdamage1 \
+ libxext6 \
+ libxfixes3 \
+ libxi6 \
+ libxrandr2 \
+ libxrender1 \
+ libxss1 \
+ libxtst6 \
+ libxcb-dri3-0 \
+ libxcb-shm0 \
+ libxshmfence1 \
+ libgbm1 \
+ libatk1.0-0 \
+ libatk-bridge2.0-0 \
+ libcups2 \
+ libdrm2 \
+ libgtk-3-0 \
+ libnspr4 \
+ libnss3 \
+ libpango-1.0-0 \
+ libpangocairo-1.0-0 \
+ libglib2.0-0 \
+ libdbus-1-3 \
+ fonts-liberation \
+ xdg-utils
+
+# Install additional fonts for better browser rendering
+DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ fonts-noto \
+ fonts-noto-cjk \
+ fonts-noto-color-emoji
+
+# Clean up
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+
+echo "✓ X11 and GUI support installed"
diff --git a/initialize.py b/initialize.py
index 3c42c952e5..8f77e40c13 100644
--- a/initialize.py
+++ b/initialize.py
@@ -60,21 +60,11 @@ def _normalize_model_kwargs(kwargs: dict) -> dict:
limit_requests=current_settings["embed_model_rl_requests"],
kwargs=_normalize_model_kwargs(current_settings["embed_model_kwargs"]),
)
- # browser model from user settings
- browser_llm = models.ModelConfig(
- type=models.ModelType.CHAT,
- provider=current_settings["browser_model_provider"],
- name=current_settings["browser_model_name"],
- api_base=current_settings["browser_model_api_base"],
- vision=current_settings["browser_model_vision"],
- kwargs=_normalize_model_kwargs(current_settings["browser_model_kwargs"]),
- )
# agent configuration
config = AgentConfig(
chat_model=chat_llm,
utility_model=utility_llm,
embeddings_model=embedding_llm,
- browser_model=browser_llm,
profile=current_settings["agent_profile"],
memory_subdir=current_settings["agent_memory_subdir"],
knowledge_subdirs=[current_settings["agent_knowledge_subdir"], "default"],
diff --git a/models.py b/models.py
index 469925e49f..d155f275fa 100644
--- a/models.py
+++ b/models.py
@@ -25,7 +25,7 @@
from python.helpers.providers import get_provider_config
from python.helpers.rate_limiter import RateLimiter
from python.helpers.tokens import approximate_tokens
-from python.helpers import dirty_json, browser_use_monkeypatch
+from python.helpers import dirty_json
from langchain_core.language_models.chat_models import SimpleChatModel
from langchain_core.outputs.chat_generation import ChatGenerationChunk
@@ -43,7 +43,7 @@
from sentence_transformers import SentenceTransformer
-# disable extra logging, must be done repeatedly, otherwise browser-use will turn it back on for some reason
+# disable extra logging
def turn_off_logging():
os.environ["LITELLM_LOG"] = "ERROR" # only errors
litellm.suppress_debug_info = True
@@ -56,9 +56,8 @@ def turn_off_logging():
# init
load_dotenv()
turn_off_logging()
-browser_use_monkeypatch.apply()
-litellm.modify_params = True # helps fix anthropic tool calls by browser-use
+litellm.modify_params = True # helps fix anthropic tool calls
class ModelType(Enum):
CHAT = "Chat"
@@ -578,83 +577,6 @@ def __init__(self, wrapper, *args, **kwargs):
self.chat = AsyncAIChatReplacement._Chat(wrapper)
-from browser_use.llm import ChatOllama, ChatOpenRouter, ChatGoogle, ChatAnthropic, ChatGroq, ChatOpenAI
-
-class BrowserCompatibleChatWrapper(ChatOpenRouter):
- """
- A wrapper for browser agent that can filter/sanitize messages
- before sending them to the LLM.
- """
-
- def __init__(self, *args, **kwargs):
- turn_off_logging()
- # Create the underlying LiteLLM wrapper
- self._wrapper = LiteLLMChatWrapper(*args, **kwargs)
- # Browser-use may expect a 'model' attribute
- self.model = self._wrapper.model_name
- self.kwargs = self._wrapper.kwargs
-
- @property
- def model_name(self) -> str:
- return self._wrapper.model_name
-
- @property
- def provider(self) -> str:
- return self._wrapper.provider
-
- def get_client(self, *args, **kwargs): # type: ignore
- return AsyncAIChatReplacement(self, *args, **kwargs)
-
- async def _acall(
- self,
- messages: List[BaseMessage],
- stop: Optional[List[str]] = None,
- run_manager: Optional[CallbackManagerForLLMRun] = None,
- **kwargs: Any,
- ):
- # Apply rate limiting if configured
- apply_rate_limiter_sync(self._wrapper.a0_model_conf, str(messages))
-
- # Call the model
- try:
- model = kwargs.pop("model", None)
- kwrgs = {**self._wrapper.kwargs, **kwargs}
-
- # hack from browser-use to fix json schema for gemini (additionalProperties, $defs, $ref)
- if "response_format" in kwrgs and "json_schema" in kwrgs["response_format"] and model.startswith("gemini/"):
- kwrgs["response_format"]["json_schema"] = ChatGoogle("")._fix_gemini_schema(kwrgs["response_format"]["json_schema"])
-
- resp = await acompletion(
- model=self._wrapper.model_name,
- messages=messages,
- stop=stop,
- **kwrgs,
- )
-
- # Gemini: strip triple backticks and conform schema
- try:
- msg = resp.choices[0].message # type: ignore
- if self.provider == "gemini" and isinstance(getattr(msg, "content", None), str):
- cleaned = browser_use_monkeypatch.gemini_clean_and_conform(msg.content) # type: ignore
- if cleaned:
- msg.content = cleaned
- except Exception:
- pass
-
- except Exception as e:
- raise e
-
- # another hack for browser-use post process invalid jsons
- try:
- if "response_format" in kwrgs and "json_schema" in kwrgs["response_format"] or "json_object" in kwrgs["response_format"]:
- if resp.choices[0].message.content is not None and not resp.choices[0].message.content.startswith("{"): # type: ignore
- js = dirty_json.parse(resp.choices[0].message.content) # type: ignore
- resp.choices[0].message.content = dirty_json.stringify(js) # type: ignore
- except Exception as e:
- pass
-
- return resp
-
class LiteLLMEmbeddingWrapper(Embeddings):
model_name: str
kwargs: dict = {}
@@ -899,16 +821,6 @@ def get_chat_model(
)
-def get_browser_model(
- provider: str, name: str, model_config: Optional[ModelConfig] = None, **kwargs: Any
-) -> BrowserCompatibleChatWrapper:
- orig = provider.lower()
- provider_name, kwargs = _merge_provider_defaults("chat", orig, kwargs)
- return _get_litellm_chat(
- BrowserCompatibleChatWrapper, name, provider_name, model_config, **kwargs
- )
-
-
def get_embedding_model(
provider: str, name: str, model_config: Optional[ModelConfig] = None, **kwargs: Any
) -> LiteLLMEmbeddingWrapper | LocalSentenceTransformerWrapper:
diff --git a/prompts/agent.system.tool.browser.md b/prompts/agent.system.tool.browser.md
index 120316e155..f0a00c46ea 100644
--- a/prompts/agent.system.tool.browser.md
+++ b/prompts/agent.system.tool.browser.md
@@ -1,36 +1,186 @@
-### browser_agent:
-
-subordinate agent controls playwright browser
-message argument talks to agent give clear instructions credentials task based
-reset argument spawns new agent
-do not reset if iterating
-be precise descriptive like: open google login and end task, log in using ... and end task
-when following up start: considering open pages
-dont use phrase wait for instructions use end task
-downloads default in /a0/tmp/downloads
-pass secrets and variables in message when needed
-
-usage:
-```json
-{
- "thoughts": ["I need to log in to..."],
- "headline": "Opening new browser session for login",
- "tool_name": "browser_agent",
- "tool_args": {
- "message": "Open and log me into...",
- "reset": "true"
- }
-}
-```
-
-```json
-{
- "thoughts": ["I need to log in to..."],
- "headline": "Continuing with existing browser session",
- "tool_name": "browser_agent",
- "tool_args": {
- "message": "Considering open pages, click...",
- "reset": "false"
- }
-}
-```
+### browser_control
+
+granular browser control with individual actions
+use for precise, step-by-step web automation tasks
+available methods: navigate, click, type, scroll, observe_page, select, press, hover, pause_for_user, get_browser_info
+screenshots captured automatically after each action for visual feedback
+
+**navigate** - go to URL
+**click** - click element by CSS selector or text
+**type** - type text into input field
+**scroll** - scroll page (direction: up/down/left/right)
+**observe_page** - get current page state, title, content, elements (adds screenshot to context)
+**select** - select option from dropdown
+**press** - press keyboard key on element
+**hover** - hover over element
+**pause_for_user** - pause execution for manual user interaction (CAPTCHAs, manual login, etc.)
+ - requires browser to be in visible mode (headless=False)
+ - waits specified seconds for user to interact with browser
+ - use when encountering CAPTCHAs, blocked automation, or manual verification needed
+**get_browser_info** - diagnostic tool to check browser visibility mode and troubleshoot
+ - shows current headless/visible mode
+ - displays configuration settings
+ - provides troubleshooting tips if browser not visible
+ - use when you can't see the browser window or need to verify settings
+
+session management:
+- browser state persists across calls
+- use reset arg to start fresh session
+- same page context maintained between actions
+- screenshots available in chat history
+
+usage examples:
+
+1. Navigate and observe
+~~~json
+{
+ "thoughts": ["Need to open the website and see what's there"],
+ "headline": "Opening website",
+ "tool_name": "browser_control:navigate",
+ "tool_args": {
+ "url": "https://example.com"
+ }
+}
+~~~
+
+2. Observe current page
+~~~json
+{
+ "thoughts": ["Let me see what's on this page"],
+ "headline": "Observing page content",
+ "tool_name": "browser_control:observe_page",
+ "tool_args": {}
+}
+~~~
+
+3. Click element
+~~~json
+{
+ "thoughts": ["Need to click the login button"],
+ "headline": "Clicking login button",
+ "tool_name": "browser_control:click",
+ "tool_args": {
+ "selector": "button[type='submit']"
+ }
+}
+~~~
+
+4. Type into field
+~~~json
+{
+ "thoughts": ["Entering username"],
+ "headline": "Typing username",
+ "tool_name": "browser_control:type",
+ "tool_args": {
+ "selector": "input[name='username']",
+ "text": "myusername"
+ }
+}
+~~~
+
+5. Scroll page
+~~~json
+{
+ "thoughts": ["Need to see more content"],
+ "headline": "Scrolling down",
+ "tool_name": "browser_control:scroll",
+ "tool_args": {
+ "direction": "down"
+ }
+}
+~~~
+
+6. Select dropdown option
+~~~json
+{
+ "thoughts": ["Need to select country from dropdown"],
+ "headline": "Selecting country",
+ "tool_name": "browser_control:select",
+ "tool_args": {
+ "selector": "select[name='country']",
+ "value": "USA"
+ }
+}
+~~~
+
+7. Press key
+~~~json
+{
+ "thoughts": ["Need to submit form with Enter key"],
+ "headline": "Pressing Enter",
+ "tool_name": "browser_control:press",
+ "tool_args": {
+ "selector": "input[name='search']",
+ "key": "Enter"
+ }
+}
+~~~
+
+8. Hover over element
+~~~json
+{
+ "thoughts": ["Need to hover over menu to reveal submenu"],
+ "headline": "Hovering over menu",
+ "tool_name": "browser_control:hover",
+ "tool_args": {
+ "selector": "#main-menu"
+ }
+}
+~~~
+
+9. Pause for user interaction
+~~~json
+{
+ "thoughts": ["Encountered a CAPTCHA that needs manual solving"],
+ "headline": "Pausing for CAPTCHA",
+ "tool_name": "browser_control:pause_for_user",
+ "tool_args": {
+ "wait_seconds": 120,
+ "message": "Please solve the CAPTCHA"
+ }
+}
+~~~
+
+10. Check browser visibility and settings
+~~~json
+{
+ "thoughts": ["User says they can't see the browser window, let me check the configuration"],
+ "headline": "Checking browser settings",
+ "tool_name": "browser_control:get_browser_info",
+ "tool_args": {}
+}
+~~~
+
+11. Reset session
+~~~json
+{
+ "thoughts": ["Browser session seems stuck, starting fresh"],
+ "headline": "Resetting browser session",
+ "tool_name": "browser_control:navigate",
+ "tool_args": {
+ "url": "https://example.com",
+ "reset": "true"
+ }
+}
+~~~
+
+**configuration:**
+- visible browser (needed for manual interaction) is the default: `browser_control_headless: False` in agent config
+- set `browser_control_headless: True` to run the browser invisibly in the background (headless mode)
+- visible mode required for pause_for_user to work
+- start URL can be configured with `browser_control_start_url`
+- timeout can be configured with `browser_control_timeout` (milliseconds)
+
+**best practices:**
+- always observe_page first to understand current state
+- use specific CSS selectors when possible (id, class, name attribute)
+- for text-based clicking, selector will be treated as text content
+- handle failures gracefully - try alternative selectors if needed
+- reset session if browser gets stuck or navigation fails repeatedly
+- each action is atomic - chain multiple actions for complex workflows
+- screenshots show visual state after each action
+- observe_page adds screenshot to your context for vision analysis
+- use pause_for_user when encountering CAPTCHAs or automation blocks
+- use get_browser_info when user reports browser visibility issues
+- if browser was initialized in wrong mode, use reset=true to restart it
+
diff --git a/prompts/browser_agent.system.md b/prompts/browser_control.system.md
similarity index 100%
rename from prompts/browser_agent.system.md
rename to prompts/browser_control.system.md
diff --git a/python/api/browser_control.py b/python/api/browser_control.py
new file mode 100644
index 0000000000..1899ff66b4
--- /dev/null
+++ b/python/api/browser_control.py
@@ -0,0 +1,81 @@
+from python.helpers.api import ApiHandler, Request, Response
+from flask import send_file, redirect
+import os
+
+class BrowserControl(ApiHandler):
+    """
+    API endpoint for accessing the browser control interface (noVNC).
+    This allows users to manually interact with the browser when the agent pauses.
+    """
+
+    @classmethod
+    def requires_auth(cls) -> bool:
+        # Require authentication for browser control access
+        return True
+
+    @classmethod
+    def requires_csrf(cls) -> bool:
+        # CSRF not needed for GET requests
+        return False
+
+    @classmethod
+    def get_methods(cls) -> list[str]:
+        return ["GET"]
+
+    async def process(self, input: dict, request: Request) -> dict | Response:
+        """
+        Report the VNC server status or redirect to the noVNC web client.
+
+        Query parameters:
+        - action: 'info' (default) | 'redirect'
+            - info: Returns VNC connection details
+            - redirect: Redirects to the noVNC web client
+
+        Returns a dict of connection details for 'info', or a 302 redirect for 'redirect'.
+        """
+        action = request.args.get('action', 'info')
+        vnc_status_file = '/tmp/vnc/status'
+
+        # Get external port mapping from environment variable (for Docker port mapping)
+        # Default to 56080 which is the standard external mapping for noVNC port 6080
+        external_novnc_port = os.environ.get('NOVNC_EXTERNAL_PORT', '56080')
+
+        # Parse the KEY=VALUE status file. Open it directly rather than testing
+        # for existence first, so a file that disappears in between is handled too.
+        try:
+            with open(vnc_status_file, 'r') as f:
+                status_dict = dict(
+                    line.strip().split('=', 1) for line in f if '=' in line
+                )
+        except (OSError, UnicodeDecodeError):
+            # Missing or unreadable status file: fall back to the defaults below,
+            # which report VNC as not ready.
+            status_dict = {}
+
+        vnc_ready = status_dict.get('READY', 'false') == 'true'
+        vnc_display = status_dict.get('DISPLAY', ':99')
+        novnc_port = status_dict.get('NOVNC_PORT', '6080')
+
+        # noVNC client URL on the externally mapped port, built once for both actions.
+        # Parameters:
+        # - autoconnect: Connect automatically on load
+        # - resize=none: No scaling requested (noVNC documents off/scale/remote - TODO confirm 'none' acts as off)
+        # - reconnect: Automatically reconnect if connection is lost
+        # - reconnect_delay: Wait 1000 ms before reconnecting
+        # - show_dot: Show a dot cursor when the server provides no visible cursor
+        novnc_url = (
+            f"http://localhost:{external_novnc_port}/vnc.html"
+            "?autoconnect=true&resize=none&reconnect=true&reconnect_delay=1000&show_dot=true"
+        )
+
+        if action == 'redirect':
+            return redirect(novnc_url, code=302)
+
+        return {
+            "vnc_ready": vnc_ready,
+            "vnc_display": vnc_display,
+            "novnc_port": novnc_port,
+            "external_novnc_port": external_novnc_port,
+            "novnc_url": novnc_url,
+            "instructions": "Click the noVNC URL to access the browser control interface" if vnc_ready else "VNC server is not running",
+        }
diff --git a/python/helpers/browser_control_client.py b/python/helpers/browser_control_client.py
new file mode 100644
index 0000000000..7a891fada9
--- /dev/null
+++ b/python/helpers/browser_control_client.py
@@ -0,0 +1,602 @@
+"""
+Browser Control Client - Playwright interface for browser automation.
+
+This module provides the PlaywrightClient for browser automation.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, List, Optional
+import base64
+
+
+class ActionType(str, Enum):
+ """Action kinds accepted by PlaywrightClient.execute_action (str-valued Enum members)."""
+
+ CLICK = "click"  # requires Action.selector
+ TYPE = "type"  # requires Action.selector and Action.value
+ SELECT = "select"  # requires Action.selector and Action.value
+ NAVIGATE = "navigate"  # requires Action.value (the URL)
+ SCREENSHOT = "screenshot"  # NOTE(review): execute_action has no branch for this; it reports "Unsupported action type"
+ SCROLL = "scroll"  # Action.value: "down"/"up"/"left"/"right" or a vertical pixel count
+ PRESS = "press"  # requires Action.selector and Action.value (the key)
+ HOVER = "hover"  # requires Action.selector
+ PAUSE_FOR_USER = "pause_for_user"  # Action.value: wait time in seconds (60 when unset)
+
+
+@dataclass
+class Action:
+ """One request for PlaywrightClient.execute_action; the fields needed depend on action_type."""
+
+ action_type: ActionType
+ selector: Optional[str] = None  # target element selector (CSS or text, see the CLICK handling)
+ value: Optional[str] = None  # URL, text to type, option, key, scroll direction or wait seconds
+ coordinates: Optional[Dict[str, int]] = None  # not read by execute_action in this module
+ metadata: Optional[Dict[str, Any]] = None  # keys read in this module: "message", "capture_screenshot"
+
+ def __post_init__(self):
+ if self.metadata is None:
+ self.metadata = {}  # each instance gets its own dict rather than a shared default
+
+
+@dataclass
+class ActionResult:
+ """Outcome of PlaywrightClient.execute_action for a single Action."""
+
+ success: bool
+ description: str  # human-readable summary; "" on the failure results built in this module
+ error: Optional[str] = None  # failure message; stays None on success
+ screenshot: Optional[bytes] = None  # PNG bytes, filled when Action.metadata["capture_screenshot"] is truthy
+ task_complete: bool = False  # never set in this module — presumably meant for callers; confirm
+ metadata: Optional[Dict[str, Any]] = None  # replaced by {} in __post_init__ when left as None
+
+ def __post_init__(self):
+ if self.metadata is None:
+ self.metadata = {}  # each instance gets its own dict rather than a shared default
+
+
+@dataclass
+class InterfaceState:
+ """Snapshot of the page as assembled by PlaywrightClient.get_state."""
+
+ url: Optional[str] = None  # page.url at snapshot time
+ title: Optional[str] = None  # result of page.title()
+ content: str = ""  # result of page.content(); left "" when not requested
+ interactive_elements: Optional[List[Dict[str, Any]]] = None  # dicts with tag/text/type/placeholder/href/selector
+ screenshot: Optional[bytes] = None  # PNG bytes for the "visual" and "hybrid" formats
+ metadata: Optional[Dict[str, Any]] = None  # not populated anywhere in this module
+
+ def __post_init__(self):
+ if self.interactive_elements is None:
+ self.interactive_elements = []  # fresh list per instance
+ if self.metadata is None:
+ self.metadata = {}  # fresh dict per instance
+
+
+@dataclass
+class BrowserControlState:
+ """State management for browser control tool."""
+
+ playwright: Optional[Any] = None
+ browser: Optional[Any] = None
+ context: Optional[Any] = None
+ page: Optional[Any] = None
+ client: Optional['PlaywrightClient'] = None
+ initialized: bool = False
+
+ def __del__(self):
+ """Cleanup on deletion."""
+ if self.initialized and self.client:
+ try:
+ import asyncio
+ # Try to close synchronously
+ try:
+ loop = asyncio.get_event_loop()
+ if loop.is_running():
+ loop.create_task(self.client.close())
+ else:
+ asyncio.run(self.client.close())
+ except RuntimeError:
+ pass
+ except Exception:
+ # Silently fail - destructor shouldn't raise
+ pass
+
+
+class PlaywrightClient:
+ """
+ Web interface automation using Playwright.
+
+ Provides browser automation capabilities for web applications.
+ """
+
+ def __init__(
+ self,
+ start_url: str = "https://www.google.com",
+ headless: bool = True,
+ playwright_binary: Optional[str] = None,
+ cdp_url: Optional[str] = None,
+ use_vnc: bool = False,
+ vnc_display: Optional[str] = None
+ ):
+ """
+ Store the configuration only; no browser is started until initialize() is awaited.
+
+ Args:
+ start_url: URL that initialize() opens once a page is available
+ headless: Launch the embedded browser headless (only used when cdp_url is empty)
+ playwright_binary: Executable path passed to chromium.launch(); ignored when cdp_url is set
+ cdp_url: Chrome DevTools Protocol URL of an existing browser to attach to (optional)
+ e.g., "http://localhost:9222" or "http://host.docker.internal:9222"
+ use_vnc: Point DISPLAY at the VNC display in initialize() and enable get_vnc_url()
+ vnc_display: VNC display (e.g., ":99"). If None, read from VNC_DISPLAY env var, else ":99"
+ """
+ self.start_url = start_url
+ self.headless = headless
+ self.playwright_binary = playwright_binary
+ self.cdp_url = cdp_url
+ self.use_vnc = use_vnc
+ self.vnc_display = vnc_display
+ self.playwright = None  # the four handles below stay None until initialize() sets them
+ self.browser = None
+ self.context = None
+ self.page = None
+ self.action_history = []  # Action objects appended by execute_action when no exception was raised
+
+ async def initialize(self) -> None:
+ """Start Playwright, attach over CDP (cdp_url set) or launch Chromium, then load start_url; raises ImportError without Playwright."""
+ try:
+ from playwright.async_api import async_playwright
+ except ImportError:
+ raise ImportError(
+ "Playwright is not installed. Install with: pip install playwright"
+ )
+
+ # Configure VNC display if enabled
+ if self.use_vnc:
+ import os
+ # Get VNC display from instance variable or environment
+ vnc_display = self.vnc_display or os.environ.get('VNC_DISPLAY', ':99')
+ # Set DISPLAY environment variable for Playwright to use VNC (this changes it for the whole process)
+ original_display = os.environ.get('DISPLAY')
+ os.environ['DISPLAY'] = vnc_display
+ print(f"Using VNC display: {vnc_display}")
+ # Remembered for a later restore; NOTE(review): nothing in this class reads _original_display
+ self._original_display = original_display
+
+ self.playwright = await async_playwright().start()
+
+ # Connect via CDP if URL provided (native browser mode)
+ if self.cdp_url:
+ print(f"Connecting to browser via CDP: {self.cdp_url}")
+
+ # Handle host.docker.internal Host header issue
+ # Convert HTTP endpoint to WebSocket to bypass Chrome's Host header validation
+ endpoint_url = self.cdp_url
+ if endpoint_url.startswith("http://") and "host.docker.internal" in endpoint_url:
+ import re
+ port_match = re.search(r':(\d+)', endpoint_url)  # first ":<digits>" in the URL is taken as the port
+ if port_match:
+ port = port_match.group(1)
+ # Use WebSocket format - less strict Host header checking
+ endpoint_url = f"ws://host.docker.internal:{port}"
+ print(f" → Converted to WebSocket: {endpoint_url}")
+
+ self.browser = await self.playwright.chromium.connect_over_cdp(
+ endpoint_url=endpoint_url,
+ timeout=30000 # 30 seconds
+ )
+ # Reuse the connected browser's first context; create one only when it has none
+ self.context = self.browser.contexts[0] if self.browser.contexts else await self.browser.new_context(
+ viewport={"width": 800, "height": 1600}
+ )
+ # Use existing page or create new one
+ self.page = self.context.pages[0] if self.context.pages else await self.context.new_page()
+ # Navigate to start URL (a reused page is navigated away from whatever it was showing)
+ await self.page.goto(self.start_url)
+ else:
+ # Launch browser with optional binary path (embedded browser mode)
+ launch_options = {
+ "headless": self.headless,
+ "args": ["--headless=new"] if self.headless else []
+ }
+ if self.playwright_binary:
+ launch_options["executable_path"] = self.playwright_binary
+
+ self.browser = await self.playwright.chromium.launch(**launch_options)
+
+ # Create context with viewport size matching browser_agent
+ self.context = await self.browser.new_context(
+ viewport={"width": 800, "height": 1600}
+ )
+ self.page = await self.context.new_page()
+ await self.page.goto(self.start_url)
+
+ async def get_state(self, format: str = "hybrid") -> InterfaceState:
+ """Snapshot the page: url and title, page.content() for "text"/"hybrid", a PNG for "visual"/"hybrid", plus interactive elements."""
+ if not self.page:
+ raise RuntimeError("Browser not initialized. Call initialize() first.")
+
+ state = InterfaceState(url=self.page.url, title=await self.page.title())
+
+ if format in ["text", "hybrid"]:
+ # Store the result of page.content() (presumably the page's HTML markup, not only visible text — confirm)
+ state.content = await self.page.content()
+
+ # Get interactive elements (NOTE(review): indentation is lost in this patch text; confirm whether this is inside the branch above)
+ elements = await self._get_interactive_elements()
+ state.interactive_elements = elements
+
+ if format in ["visual", "hybrid"]:
+ # Get screenshot
+ state.screenshot = await self.get_screenshot()
+
+ return state
+
+ async def _get_interactive_elements(self) -> List[Dict[str, Any]]:
+ """Return one dict per visible interactive element found by in-page JS; [] when there is no page or evaluation fails."""
+ if not self.page:
+ return []
+
+ try:
+ elements = await self.page.evaluate(
+ """
+ () => {
+ const interactiveSelectors = [
+ 'button', 'a', 'input', 'select', 'textarea',
+ '[role="button"]', '[role="link"]', '[onclick]'
+ ];
+
+ const elements = [];
+ interactiveSelectors.forEach(selector => {
+ document.querySelectorAll(selector).forEach(el => {
+ if (el.offsetParent !== null) { // Is visible
+ elements.push({
+ tag: el.tagName.toLowerCase(),
+ text: el.innerText || el.value || '',
+ type: el.type || '',
+ placeholder: el.placeholder || '',
+ href: el.href || '',
+ selector: el.id ? `#${el.id}` :
+ el.className ? `.${el.className.split(' ')[0]}` :
+ el.tagName.toLowerCase()
+ });
+ }
+ });
+ });
+ return elements;
+ }
+ """
+ )
+ return elements  # an element matching several selectors is listed once per match (the script does not de-duplicate)
+ except Exception:  # best-effort: any evaluate() failure is reported as "no elements"
+ return []
+
+ async def execute_action(self, action: Action) -> ActionResult:
+ """Execute an action on the web page."""
+ if not self.page:
+ raise RuntimeError("Browser not initialized. Call initialize() first.")
+
+ try:
+ if action.action_type == ActionType.NAVIGATE:
+ if not action.value:
+ raise ValueError("Navigate action requires a URL value")
+
+ # Try navigation with robust fallback strategy
+ try:
+ # First attempt: wait for networkidle (ideal but may timeout on slow sites)
+ await self.page.goto(
+ action.value, wait_until="networkidle", timeout=5000
+ )
+ result = ActionResult(
+ success=True, description=f"Navigated to {action.value}"
+ )
+ except Exception as e:
+ # Fallback: if networkidle times out, check if page loaded at all
+ current_url = self.page.url
+ if current_url and (
+ action.value in current_url or current_url != "about:blank"
+ ):
+ # Page loaded even if not fully idle - consider it a success
+ try:
+ # Wait a bit for DOM to be ready
+ await self.page.wait_for_load_state(
+ "domcontentloaded", timeout=5000
+ )
+ except:
+ pass
+ result = ActionResult(
+ success=True,
+ description=f"Navigated to {action.value} (page loaded but not fully idle)",
+ )
+ else:
+ # Navigation truly failed
+ raise e
+
+ elif action.action_type == ActionType.CLICK:
+ if not action.selector:
+ raise ValueError("Click action requires a selector")
+
+ # Try different selector strategies with detailed error tracking
+ clicked = False
+ selector = action.selector
+ attempted_selectors = []
+ last_error = None
+
+ # Strategy 1: Direct CSS selector (wait for visibility first)
+ try:
+ # Wait for element to be visible before clicking
+ await self.page.wait_for_selector(selector, state="visible", timeout=3000)
+ await self.page.click(selector, timeout=2000)
+ clicked = True
+ except Exception as e:
+ attempted_selectors.append(f"CSS:{selector}")
+ last_error = str(e)
+
+ # Strategy 2: If selector contains :contains(), extract and try text-based
+ if ":contains(" in selector and not clicked:
+ import re
+ match = re.search(r":contains\(['\"]?(.*?)['\"]?\)", selector)
+ if match:
+ text = match.group(1)
+
+ # Try exact text match
+ try:
+ await self.page.wait_for_selector(f"text={text}", state="visible", timeout=2000)
+ await self.page.click(f"text={text}", timeout=2000)
+ clicked = True
+ selector = f"text={text}"
+ except Exception as e2:
+ attempted_selectors.append(f"text={text}")
+ last_error = str(e2)
+
+ # Try partial text match
+ try:
+ await self.page.click(f"text=/.*{text}.*/i", timeout=2000)
+ clicked = True
+ selector = f"text=/.*{text}.*/i"
+ except Exception as e3:
+ attempted_selectors.append(f"text=/.*{text}.*/i")
+ last_error = str(e3)
+
+ # Try href match for links
+ try:
+ link_selector = f"a[href*='{text.lower()}']"
+ await self.page.click(link_selector, timeout=2000)
+ clicked = True
+ selector = link_selector
+ except Exception as e4:
+ attempted_selectors.append(link_selector)
+ last_error = str(e4)
+
+ # Strategy 3: If plain text (not CSS), try as text selector
+ if (
+ not clicked
+ and not selector.startswith("#")
+ and not selector.startswith(".")
+ and not selector.startswith("[")
+ ):
+ # Try exact text
+ try:
+ await self.page.click(f"text={selector}", timeout=2000)
+ clicked = True
+ selector = f"text={selector}"
+ except Exception as e5:
+ attempted_selectors.append(f"text={selector}")
+ last_error = str(e5)
+
+ # Try partial text match (case-insensitive)
+ try:
+ await self.page.click(f"text=/.*{selector}.*/i", timeout=2000)
+ clicked = True
+ selector = f"text=/.*{selector}.*/i"
+ except Exception as e6:
+ attempted_selectors.append(f"text=/.*{selector}.*/i")
+ last_error = str(e6)
+
+ # Strategy 4: Force click if element is covered (e.g., by ads)
+ if not clicked:
+ try:
+ original_selector = action.selector
+ # Try to locate the element and force click
+ await self.page.click(original_selector, force=True, timeout=2000)
+ clicked = True
+ selector = f"{original_selector} (forced)"
+ except Exception as e7:
+ attempted_selectors.append(f"force:{original_selector}")
+ last_error = str(e7)
+
+ if not clicked:
+ # Provide helpful error message with all attempted strategies
+ error_msg = f"Failed to click element. Attempted selectors: {', '.join(attempted_selectors)}. Last error: {last_error}"
+ raise Exception(error_msg)
+
+ result = ActionResult(
+ success=True, description=f"Clicked on {selector}"
+ )
+
+ elif action.action_type == ActionType.TYPE:
+ if not action.selector or not action.value:
+ raise ValueError("Type action requires both selector and value")
+ # Wait for input to be visible before typing
+ await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
+ await self.page.fill(action.selector, action.value)
+ result = ActionResult(
+ success=True,
+ description=f"Typed '{action.value}' into {action.selector}",
+ )
+
+ elif action.action_type == ActionType.SELECT:
+ if not action.selector or not action.value:
+ raise ValueError("Select action requires both selector and value")
+ # Wait for select element to be visible
+ await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
+ await self.page.select_option(action.selector, action.value)
+ result = ActionResult(
+ success=True,
+ description=f"Selected '{action.value}' in {action.selector}",
+ )
+
+ elif action.action_type == ActionType.PRESS:
+ if not action.selector or not action.value:
+ raise ValueError("Press action requires both selector and value")
+ # Wait for element to be visible before pressing key
+ await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
+ await self.page.press(action.selector, action.value)
+ result = ActionResult(
+ success=True,
+ description=f"Pressed '{action.value}' on {action.selector}",
+ )
+
+ elif action.action_type == ActionType.SCROLL:
+ # Map direction to scroll values
+ direction = action.value or "down"
+ scroll_x, scroll_y = 0, 0
+
+ if direction == "down":
+ scroll_y = 500
+ elif direction == "up":
+ scroll_y = -500
+ elif direction == "right":
+ scroll_x = 500
+ elif direction == "left":
+ scroll_x = -500
+ else:
+ # If it's a number, use it directly for vertical scrolling
+ try:
+ scroll_y = int(direction)
+ except ValueError:
+ scroll_y = 500 # Default to scrolling down
+
+ await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
+ result = ActionResult(
+ success=True,
+ description=f"Scrolled {direction} by {abs(scroll_y or scroll_x)} pixels",
+ )
+
+ elif action.action_type == ActionType.HOVER:
+ if not action.selector:
+ raise ValueError("Hover action requires a selector")
+ # Wait for element to be visible before hovering
+ await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
+ await self.page.hover(action.selector)
+ result = ActionResult(
+ success=True, description=f"Hovered over {action.selector}"
+ )
+
+ elif action.action_type == ActionType.PAUSE_FOR_USER:
+ # Pause execution and wait for user interaction
+ # This is useful for CAPTCHAs, manual login, or other user interventions
+ wait_time = int(action.value) if action.value else 60
+ message = action.metadata.get("message", "Pausing for user interaction...")
+
+ # Check if VNC is available for user interaction
+ vnc_url = self.get_vnc_url(host="localhost", port=56080)
+
+ if not vnc_url and self.headless:
+ # No VNC and headless - user has no way to interact
+ result = ActionResult(
+ success=False,
+ description="",
+ error="Cannot pause for user: browser is in headless mode and VNC is not available. Set headless=False or enable VNC when initializing the browser."
+ )
+ else:
+ # VNC is available or browser is visible - user can interact
+ # Return immediately without blocking - let the agent handle the pause
+ print(f"\n{'='*60}")
+ print(f"BROWSER READY FOR USER INTERACTION: {message}")
+ print(f"Current URL: {self.page.url}")
+ if vnc_url:
+ print(f"VNC URL: {vnc_url}")
+ print(f"Browser control panel will open automatically in web UI")
+ else:
+ print(f"Browser window should be visible on your display")
+ print(f"Agent will wait up to {wait_time} seconds")
+ print(f"{'='*60}\n")
+
+ result = ActionResult(
+ success=True,
+ description=f"Browser ready for user interaction. Agent will pause for up to {wait_time} seconds. Current page: {self.page.url}"
+ )
+
+ else:
+ result = ActionResult(
+ success=False,
+ description="",
+ error=f"Unsupported action type: {action.action_type}",
+ )
+
+ # Record action in history
+ self.action_history.append(action)
+
+ # Add screenshot if requested
+ if action.metadata.get("capture_screenshot", False):
+ result.screenshot = await self.get_screenshot()
+
+ return result
+
+ except Exception as e:
+ return ActionResult(success=False, description="", error=str(e))
+
+ async def get_screenshot(self) -> bytes:
+ """Return a PNG screenshot of the current page taken with full_page=False; raises RuntimeError when no page exists."""
+ if not self.page:
+ raise RuntimeError("Browser not initialized. Call initialize() first.")
+
+ return await self.page.screenshot(type="png", full_page=False)
+
+ async def get_screenshot_base64(self) -> str:
+ """Return the get_screenshot() PNG bytes base64-encoded as a str, for use in LLM context."""
+ screenshot_bytes = await self.get_screenshot()  # propagates RuntimeError when no page exists
+ return base64.b64encode(screenshot_bytes).decode('utf-8')
+
+ def get_vnc_url(self, host: str = "localhost", port: int = 6080) -> Optional[str]:
+ """
+ Build the noVNC URL for manual browser control from the /tmp/vnc/status file.
+
+ Args:
+ host: Host name placed in the returned URL (default: localhost)
+ port: Fallback port, used only when the status file has no NOVNC_PORT entry (default: 6080)
+
+ Returns:
+ noVNC URL if use_vnc is set and the status file reports READY=true, None otherwise
+ """
+ if not self.use_vnc:
+ return None
+
+ import os
+ # Check if VNC is ready
+ vnc_status_file = '/tmp/vnc/status'
+ if not os.path.exists(vnc_status_file):
+ return None
+
+ try:
+ with open(vnc_status_file, 'r') as f:
+ status_lines = f.readlines()
+ status_dict = {}  # KEY=VALUE pairs, split on the first "="; lines without "=" are skipped
+ for line in status_lines:
+ if '=' in line:
+ key, value = line.strip().split('=', 1)
+ status_dict[key] = value
+
+ vnc_ready = status_dict.get('READY', 'false') == 'true'
+ if not vnc_ready:
+ return None
+
+ novnc_port = status_dict.get('NOVNC_PORT', str(port))  # the file's value wins over the port argument
+ return f"http://{host}:{novnc_port}/vnc.html?autoconnect=true&resize=none"
+ except Exception:  # an unreadable status file is treated the same as "VNC not available"
+ return None
+
+ async def close(self) -> None:
+ """Close browser and clean up."""
+ if self.page:
+ await self.page.close()
+ if self.context:
+ await self.context.close()
+ if self.browser:
+ await self.browser.close()
+ if self.playwright:
+ await self.playwright.stop()
+
diff --git a/python/helpers/browser_use.py b/python/helpers/browser_use.py
deleted file mode 100644
index 5c1800d2e4..0000000000
--- a/python/helpers/browser_use.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from python.helpers import dotenv
-dotenv.save_dotenv_value("ANONYMIZED_TELEMETRY", "false")
-import browser_use
-import browser_use.utils
\ No newline at end of file
diff --git a/python/helpers/browser_use_monkeypatch.py b/python/helpers/browser_use_monkeypatch.py
deleted file mode 100644
index 8f77ca9e6b..0000000000
--- a/python/helpers/browser_use_monkeypatch.py
+++ /dev/null
@@ -1,162 +0,0 @@
-from typing import Any
-from browser_use.llm import ChatGoogle
-from python.helpers import dirty_json
-
-
-# ------------------------------------------------------------------------------
-# Gemini Helper for Output Conformance
-# ------------------------------------------------------------------------------
-# This function sanitizes and conforms the JSON output from Gemini to match
-# the specific schema expectations of the browser-use library. It handles
-# markdown fences, aliases actions (like 'complete_task' to 'done'), and
-# intelligently constructs a valid 'data' object for the final action.
-
-def gemini_clean_and_conform(text: str):
- obj = None
- try:
- # dirty_json parser is robust enough to handle markdown fences
- obj = dirty_json.parse(text)
- except Exception:
- return None # return None if parsing fails
-
- if not isinstance(obj, dict):
- return None
-
- # Conform actions to browser-use expectations
- if isinstance(obj.get("action"), list):
- normalized_actions = []
- for item in obj["action"]:
- if not isinstance(item, dict):
- continue # Skip non-dict items
-
- action_key, action_value = next(iter(item.items()), (None, None))
- if not action_key:
- continue
-
- # Alias 'complete_task' to 'done' to handle inconsistencies
- if action_key == "complete_task":
- action_key = "done"
-
- # Create a mutable copy of the value
- v = (action_value or {}).copy()
-
- if action_key in ("scroll_down", "scroll_up", "scroll"):
- is_down = action_key != "scroll_up"
- v.setdefault("down", is_down)
- v.setdefault("num_pages", 1.0)
- normalized_actions.append({"scroll": v})
- elif action_key == "go_to_url":
- v.setdefault("new_tab", False)
- normalized_actions.append({action_key: v})
- elif action_key == "done":
- # If `data` is missing, construct it from other keys
- if "data" not in v:
- # Pop fields from the top-level `done` object
- response_text = v.pop("response", None)
- summary_text = v.pop("page_summary", None)
- title_text = v.pop("title", "Task Completed")
-
- final_response = response_text or "Task completed successfully." # browser-use expects string
- final_summary = summary_text or "No page summary available." # browser-use expects string
-
- v["data"] = {
- "title": title_text,
- "response": final_response,
- "page_summary": final_summary,
- }
-
- v.setdefault("success", True)
- normalized_actions.append({action_key: v})
- else:
- normalized_actions.append(item)
- obj["action"] = normalized_actions
-
- return dirty_json.stringify(obj)
-
-# ------------------------------------------------------------------------------
-# Monkey-patch for browser-use Gemini schema issue
-# ------------------------------------------------------------------------------
-# The original _fix_gemini_schema in browser_use.llm.google.chat.ChatGoogle
-# removes the 'title' property but fails to remove it from the 'required' list,
-# causing a validation error with the Gemini API. This patch corrects that behavior.
-
-def _patched_fix_gemini_schema(self, schema: dict[str, Any]) -> dict[str, Any]:
- """
- Convert a Pydantic model to a Gemini-compatible schema.
-
- This function removes unsupported properties like 'additionalProperties' and resolves
- $ref references that Gemini doesn't support.
- """
-
- # Handle $defs and $ref resolution
- if '$defs' in schema:
- defs = schema.pop('$defs')
-
- def resolve_refs(obj: Any) -> Any:
- if isinstance(obj, dict):
- if '$ref' in obj:
- ref = obj.pop('$ref')
- ref_name = ref.split('/')[-1]
- if ref_name in defs:
- # Replace the reference with the actual definition
- resolved = defs[ref_name].copy()
- # Merge any additional properties from the reference
- for key, value in obj.items():
- if key != '$ref':
- resolved[key] = value
- return resolve_refs(resolved)
- return obj
- else:
- # Recursively process all dictionary values
- return {k: resolve_refs(v) for k, v in obj.items()}
- elif isinstance(obj, list):
- return [resolve_refs(item) for item in obj]
- return obj
-
- schema = resolve_refs(schema)
-
- # Remove unsupported properties
- def clean_schema(obj: Any) -> Any:
- if isinstance(obj, dict):
- # Remove unsupported properties
- cleaned = {}
- for key, value in obj.items():
- if key not in ['additionalProperties', 'title', 'default']:
- cleaned_value = clean_schema(value)
- # Handle empty object properties - Gemini doesn't allow empty OBJECT types
- if (
- key == 'properties'
- and isinstance(cleaned_value, dict)
- and len(cleaned_value) == 0
- and isinstance(obj.get('type', ''), str)
- and obj.get('type', '').upper() == 'OBJECT'
- ):
- # Convert empty object to have at least one property
- cleaned['properties'] = {'_placeholder': {'type': 'string'}}
- else:
- cleaned[key] = cleaned_value
-
- # If this is an object type with empty properties, add a placeholder
- if (
- isinstance(cleaned.get('type', ''), str)
- and cleaned.get('type', '').upper() == 'OBJECT'
- and 'properties' in cleaned
- and isinstance(cleaned['properties'], dict)
- and len(cleaned['properties']) == 0
- ):
- cleaned['properties'] = {'_placeholder': {'type': 'string'}}
-
- # PATCH: Also remove 'title' from the required list if it exists
- if 'required' in cleaned and isinstance(cleaned.get('required'), list):
- cleaned['required'] = [p for p in cleaned['required'] if p != 'title']
-
- return cleaned
- elif isinstance(obj, list):
- return [clean_schema(item) for item in obj]
- return obj
-
- return clean_schema(schema)
-
-def apply():
- """Applies the monkey-patch to ChatGoogle."""
- ChatGoogle._fix_gemini_schema = _patched_fix_gemini_schema
diff --git a/python/helpers/mcp_handler.py b/python/helpers/mcp_handler.py
index 1a16acb49e..2c44ded817 100644
--- a/python/helpers/mcp_handler.py
+++ b/python/helpers/mcp_handler.py
@@ -1112,4 +1112,4 @@ def get_session_id(self) -> Optional[str]:
"""Get the current session ID if available (for streaming HTTP clients)."""
if self.session_id_callback is not None:
return self.session_id_callback()
- return None
+ return None
\ No newline at end of file
diff --git a/python/helpers/mcp_server.py b/python/helpers/mcp_server.py
index 4c080da69c..0cbce8e6e2 100644
--- a/python/helpers/mcp_server.py
+++ b/python/helpers/mcp_server.py
@@ -430,4 +430,4 @@ async def mcp_middleware(request: Request, call_next):
status_code=403, detail="MCP server is disabled in settings."
)
- return await call_next(request)
+ return await call_next(request)
\ No newline at end of file
diff --git a/python/helpers/playwright.py b/python/helpers/playwright.py
index 34f851ab63..9ce743e37e 100644
--- a/python/helpers/playwright.py
+++ b/python/helpers/playwright.py
@@ -1,6 +1,8 @@
from pathlib import Path
import subprocess
+import sys
+import platform
from python.helpers import files
@@ -8,24 +10,98 @@
# should work for both docker and local installation
def get_playwright_binary():
+ """Get the Playwright Chromium binary path.
+
+ Looks for full Chromium browser first (supports both headless and visible mode),
+ falls back to headless shell if full browser not found.
+
+ Platform-aware: Only searches for binaries matching the current OS to prevent
+ attempting to run wrong-platform binaries (e.g., macOS binary in Linux Docker).
+ """
pw_cache = Path(get_playwright_cache_dir())
- headless_shell = next(pw_cache.glob("chromium_headless_shell-*/chrome-*/headless_shell"), None)
+
+ # Detect current platform
+ system = platform.system()
+
+ # Search for platform-specific full Chromium browser (supports visible mode)
+ full_browser = None
+ if system == "Darwin": # macOS
+ full_browser = next(pw_cache.glob("chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium"), None)
+ elif system == "Linux":
+ full_browser = next(pw_cache.glob("chromium-*/chrome-linux/chrome"), None)
+ elif system == "Windows":
+ full_browser = next(pw_cache.glob("chromium-*/chrome-win/chrome.exe"), None)
+
+ if full_browser:
+ return full_browser
+
+ # Fallback to platform-specific headless shell (headless-only, can't show GUI)
+ headless_shell = None
+ if system == "Darwin": # macOS
+ headless_shell = next(pw_cache.glob("chromium_headless_shell-*/chrome-mac/headless_shell"), None)
+ elif system == "Linux":
+ headless_shell = next(pw_cache.glob("chromium_headless_shell-*/chrome-linux/headless_shell"), None)
+ elif system == "Windows":
+ headless_shell = next(pw_cache.glob("chromium_headless_shell-*/chrome-win/headless_shell.exe"), None)
+
return headless_shell
def get_playwright_cache_dir():
return files.get_abs_path("tmp/playwright")
def ensure_playwright_binary():
+ """Ensure Playwright browser is installed.
+
+ Installs full Chromium browser (supports both visible and headless modes).
+ Falls back to headless shell only if full browser installation fails.
+
+ Cleans up wrong-platform binaries if found (e.g., macOS binary in Linux Docker).
+ """
+ import os
+ import shutil
+
bin = get_playwright_binary()
if not bin:
cache = get_playwright_cache_dir()
- import os
+ pw_cache = Path(cache)
+
+ # Clean up wrong-platform binaries to avoid confusion and save space
+ system = platform.system()
+ wrong_platform_dirs = []
+
+ if system != "Darwin": # Not macOS - remove macOS binaries
+ wrong_platform_dirs.extend(pw_cache.glob("chromium-*/chrome-mac"))
+ if system != "Linux": # Not Linux - remove Linux binaries
+ wrong_platform_dirs.extend(pw_cache.glob("chromium-*/chrome-linux"))
+ if system != "Windows": # Not Windows - remove Windows binaries
+ wrong_platform_dirs.extend(pw_cache.glob("chromium-*/chrome-win"))
+
+ for wrong_dir in wrong_platform_dirs:
+ print(f"Removing wrong-platform binary: {wrong_dir}")
+ # Remove the entire chromium-* directory, not just the platform subdirectory
+ chromium_dir = wrong_dir.parent
+ if chromium_dir.exists():
+ shutil.rmtree(chromium_dir)
+
env = os.environ.copy()
env["PLAYWRIGHT_BROWSERS_PATH"] = cache
- subprocess.check_call(
- ["playwright", "install", "chromium", "--only-shell"],
- env=env
- )
+
+ # Install full Chromium browser (supports both visible and headless modes)
+ print(f"Installing Playwright Chromium browser for {system} (supports visible mode)...")
+ try:
+ subprocess.check_call(
+ [sys.executable, "-m", "playwright", "install", "chromium"],
+ env=env
+ )
+ except subprocess.CalledProcessError as e:
+ print(f"Failed to install full Chromium: {e}")
+ print("Falling back to headless shell (headless-only)...")
+ # Fallback: install headless shell only
+ subprocess.check_call(
+ [sys.executable, "-m", "playwright", "install", "chromium", "--only-shell"],
+ env=env
+ )
+
bin = get_playwright_binary()
if not bin:
raise Exception("Playwright binary not found after installation")
diff --git a/python/helpers/settings.py b/python/helpers/settings.py
index d882de94c9..a7a790d436 100644
--- a/python/helpers/settings.py
+++ b/python/helpers/settings.py
@@ -46,14 +46,6 @@ class Settings(TypedDict):
embed_model_rl_requests: int
embed_model_rl_input: int
- browser_model_provider: str
- browser_model_name: str
- browser_model_api_base: str
- browser_model_vision: bool
- browser_model_rl_requests: int
- browser_model_rl_input: int
- browser_model_rl_output: int
- browser_model_kwargs: dict[str, Any]
browser_http_headers: dict[str, Any]
agent_profile: str
@@ -429,106 +421,6 @@ def convert_out(settings: Settings) -> SettingsOutput:
"tab": "agent",
}
- # embedding model section
- browser_model_fields: list[SettingsField] = []
- browser_model_fields.append(
- {
- "id": "browser_model_provider",
- "title": "Web Browser model provider",
- "description": "Select provider for web browser model used by browser-use framework",
- "type": "select",
- "value": settings["browser_model_provider"],
- "options": cast(list[FieldOption], get_providers("chat")),
- }
- )
- browser_model_fields.append(
- {
- "id": "browser_model_name",
- "title": "Web Browser model name",
- "description": "Exact name of model from selected provider",
- "type": "text",
- "value": settings["browser_model_name"],
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_model_api_base",
- "title": "Web Browser model API base URL",
- "description": "API base URL for web browser model. Leave empty for default. Only relevant for Azure, local and custom (other) providers.",
- "type": "text",
- "value": settings["browser_model_api_base"],
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_model_vision",
- "title": "Use Vision",
- "description": "Models capable of Vision can use it to analyze web pages from screenshots. Increases quality but also token usage.",
- "type": "switch",
- "value": settings["browser_model_vision"],
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_model_rl_requests",
- "title": "Web Browser model rate limit requests",
- "description": "Rate limit requests for web browser model.",
- "type": "number",
- "value": settings["browser_model_rl_requests"],
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_model_rl_input",
- "title": "Web Browser model rate limit input",
- "description": "Rate limit input for web browser model.",
- "type": "number",
- "value": settings["browser_model_rl_input"],
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_model_rl_output",
- "title": "Web Browser model rate limit output",
- "description": "Rate limit output for web browser model.",
- "type": "number",
- "value": settings["browser_model_rl_output"],
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_model_kwargs",
- "title": "Web Browser model additional parameters",
- "description": "Any other parameters supported by LiteLLM. Format is KEY=VALUE on individual lines, like .env file. Value can also contain JSON objects - when unquoted, it is treated as object, number etc., when quoted, it is treated as string.",
- "type": "textarea",
- "value": _dict_to_env(settings["browser_model_kwargs"]),
- }
- )
-
- browser_model_fields.append(
- {
- "id": "browser_http_headers",
- "title": "HTTP Headers",
- "description": "HTTP headers to include with all browser requests. Format is KEY=VALUE on individual lines, like .env file. Value can also contain JSON objects - when unquoted, it is treated as object, number etc., when quoted, it is treated as string. Example: Authorization=Bearer token123",
- "type": "textarea",
- "value": _dict_to_env(settings.get("browser_http_headers", {})),
- }
- )
-
- browser_model_section: SettingsSection = {
- "id": "browser_model",
- "title": "Web Browser Model",
- "description": "Settings for the web browser model. Agent Zero uses browser-use agentic framework to handle web interactions.",
- "fields": browser_model_fields,
- "tab": "agent",
- }
-
# basic auth section
auth_fields: list[SettingsField] = []
@@ -1257,7 +1149,6 @@ def convert_out(settings: Settings) -> SettingsOutput:
agent_section,
chat_model_section,
util_model_section,
- browser_model_section,
embed_model_section,
memory_section,
speech_section,
@@ -1451,14 +1342,6 @@ def get_default_settings() -> Settings:
embed_model_kwargs={},
embed_model_rl_requests=0,
embed_model_rl_input=0,
- browser_model_provider="openrouter",
- browser_model_name="openai/gpt-4.1",
- browser_model_api_base="",
- browser_model_vision=True,
- browser_model_rl_requests=0,
- browser_model_rl_input=0,
- browser_model_rl_output=0,
- browser_model_kwargs={"temperature": "0"},
browser_http_headers={},
memory_recall_enabled=True,
memory_recall_delayed=False,
diff --git a/python/helpers/vector_db.py b/python/helpers/vector_db.py
index 2b94960e31..c68c517d17 100644
--- a/python/helpers/vector_db.py
+++ b/python/helpers/vector_db.py
@@ -147,4 +147,4 @@ def comparator(data: dict[str, Any]):
# PrintStyle.error(f"Error evaluating condition: {e}")
return False
- return comparator
+ return comparator
\ No newline at end of file
diff --git a/python/tools/browser_agent.py b/python/tools/browser_agent.py
deleted file mode 100644
index 6d5f085b26..0000000000
--- a/python/tools/browser_agent.py
+++ /dev/null
@@ -1,428 +0,0 @@
-import asyncio
-import time
-from typing import Optional, cast
-from agent import Agent, InterventionException
-from pathlib import Path
-
-from python.helpers.tool import Tool, Response
-from python.helpers import files, defer, persist_chat, strings
-from python.helpers.browser_use import browser_use # type: ignore[attr-defined]
-from python.helpers.print_style import PrintStyle
-from python.helpers.playwright import ensure_playwright_binary
-from python.helpers.secrets import get_secrets_manager
-from python.extensions.message_loop_start._10_iteration_no import get_iter_no
-from pydantic import BaseModel
-import uuid
-from python.helpers.dirty_json import DirtyJson
-
-
-class State:
- @staticmethod
- async def create(agent: Agent):
- state = State(agent)
- return state
-
- def __init__(self, agent: Agent):
- self.agent = agent
- self.browser_session: Optional[browser_use.BrowserSession] = None
- self.task: Optional[defer.DeferredTask] = None
- self.use_agent: Optional[browser_use.Agent] = None
- self.secrets_dict: Optional[dict[str, str]] = None
- self.iter_no = 0
-
- def __del__(self):
- self.kill_task()
- files.delete_dir(self.get_user_data_dir()) # cleanup user data dir
-
- def get_user_data_dir(self):
- return str(
- Path.home()
- / ".config"
- / "browseruse"
- / "profiles"
- / f"agent_{self.agent.context.id}"
- )
-
- async def _initialize(self):
- if self.browser_session:
- return
-
- # for some reason we need to provide exact path to headless shell, otherwise it looks for headed browser
- pw_binary = ensure_playwright_binary()
-
- self.browser_session = browser_use.BrowserSession(
- browser_profile=browser_use.BrowserProfile(
- headless=True,
- disable_security=True,
- chromium_sandbox=False,
- accept_downloads=True,
- downloads_path=files.get_abs_path("tmp/downloads"),
- allowed_domains=["*", "http://*", "https://*"],
- executable_path=pw_binary,
- keep_alive=True,
- minimum_wait_page_load_time=1.0,
- wait_for_network_idle_page_load_time=2.0,
- maximum_wait_page_load_time=10.0,
- window_size={"width": 1024, "height": 2048},
- screen={"width": 1024, "height": 2048},
- viewport={"width": 1024, "height": 2048},
- no_viewport=False,
- args=["--headless=new"],
- # Use a unique user data directory to avoid conflicts
- user_data_dir=self.get_user_data_dir(),
- extra_http_headers=self.agent.config.browser_http_headers or {},
- )
- )
-
- await self.browser_session.start() if self.browser_session else None
- # self.override_hooks()
-
- # --------------------------------------------------------------------------
- # Patch to enforce vertical viewport size
- # --------------------------------------------------------------------------
- # Browser-use auto-configuration overrides viewport settings, causing wrong
- # aspect ratio. We fix this by directly setting viewport size after startup.
- # --------------------------------------------------------------------------
-
- if self.browser_session:
- try:
- page = await self.browser_session.get_current_page()
- if page:
- await page.set_viewport_size({"width": 1024, "height": 2048})
- except Exception as e:
- PrintStyle().warning(f"Could not force set viewport size: {e}")
-
- # --------------------------------------------------------------------------
-
- # Add init script to the browser session
- if self.browser_session and self.browser_session.browser_context:
- js_override = files.get_abs_path("lib/browser/init_override.js")
- await self.browser_session.browser_context.add_init_script(path=js_override) if self.browser_session else None
-
- def start_task(self, task: str):
- if self.task and self.task.is_alive():
- self.kill_task()
-
- self.task = defer.DeferredTask(
- thread_name="BrowserAgent" + self.agent.context.id
- )
- if self.agent.context.task:
- self.agent.context.task.add_child_task(self.task, terminate_thread=True)
- self.task.start_task(self._run_task, task) if self.task else None
- return self.task
-
- def kill_task(self):
- if self.task:
- self.task.kill(terminate_thread=True)
- self.task = None
- if self.browser_session:
- try:
- import asyncio
-
- loop = asyncio.new_event_loop()
- asyncio.set_event_loop(loop)
- loop.run_until_complete(self.browser_session.close()) if self.browser_session else None
- loop.close()
- except Exception as e:
- PrintStyle().error(f"Error closing browser session: {e}")
- finally:
- self.browser_session = None
- self.use_agent = None
- self.iter_no = 0
-
- async def _run_task(self, task: str):
- await self._initialize()
-
- class DoneResult(BaseModel):
- title: str
- response: str
- page_summary: str
-
- # Initialize controller
- controller = browser_use.Controller(output_model=DoneResult)
-
- # Register custom completion action with proper ActionResult fields
- @controller.registry.action("Complete task", param_model=DoneResult)
- async def complete_task(params: DoneResult):
- result = browser_use.ActionResult(
- is_done=True, success=True, extracted_content=params.model_dump_json()
- )
- return result
-
- model = self.agent.get_browser_model()
-
- try:
-
- secrets_manager = get_secrets_manager(self.agent.context)
- secrets_dict = secrets_manager.load_secrets()
-
- self.use_agent = browser_use.Agent(
- task=task,
- browser_session=self.browser_session,
- llm=model,
- use_vision=self.agent.config.browser_model.vision,
- extend_system_message=self.agent.read_prompt(
- "prompts/browser_agent.system.md"
- ),
- controller=controller,
- enable_memory=False, # Disable memory to avoid state conflicts
- llm_timeout=3000, # TODO rem
- sensitive_data=cast(dict[str, str | dict[str, str]] | None, secrets_dict or {}), # Pass secrets
- )
- except Exception as e:
- raise Exception(
- f"Browser agent initialization failed. This might be due to model compatibility issues. Error: {e}"
- ) from e
-
- self.iter_no = get_iter_no(self.agent)
-
- async def hook(agent: browser_use.Agent):
- await self.agent.wait_if_paused()
- if self.iter_no != get_iter_no(self.agent):
- raise InterventionException("Task cancelled")
-
- # try:
- result = None
- if self.use_agent:
- result = await self.use_agent.run(
- max_steps=50, on_step_start=hook, on_step_end=hook
- )
- return result
-
- async def get_page(self):
- if self.use_agent and self.browser_session:
- try:
- return await self.use_agent.browser_session.get_current_page() if self.use_agent.browser_session else None
- except Exception:
- # Browser session might be closed or invalid
- return None
- return None
-
- async def get_selector_map(self):
- """Get the selector map for the current page state."""
- if self.use_agent:
- await self.use_agent.browser_session.get_state_summary(cache_clickable_elements_hashes=True) if self.use_agent.browser_session else None
- return await self.use_agent.browser_session.get_selector_map() if self.use_agent.browser_session else None
- await self.use_agent.browser_session.get_state_summary(
- cache_clickable_elements_hashes=True
- )
- return await self.use_agent.browser_session.get_selector_map()
- return {}
-
-
-class BrowserAgent(Tool):
-
- async def execute(self, message="", reset="", **kwargs):
- self.guid = self.agent.context.generate_id() # short random id
- reset = str(reset).lower().strip() == "true"
- await self.prepare_state(reset=reset)
- message = get_secrets_manager(self.agent.context).mask_values(message, placeholder="
Connecting to browser...
+