diff --git a/DockerfileLocal b/DockerfileLocal index f934d97498..c7a92be406 100644 --- a/DockerfileLocal +++ b/DockerfileLocal @@ -33,4 +33,4 @@ EXPOSE 22 80 9000-9009 RUN chmod +x /exe/initialize.sh /exe/run_A0.sh /exe/run_searxng.sh /exe/run_tunnel_api.sh # initialize runtime and switch to supervisord -CMD ["/exe/initialize.sh", "$BRANCH"] +CMD ["/exe/initialize.sh", "$BRANCH"] \ No newline at end of file diff --git a/agent.py b/agent.py index 594dc37bc5..4f1ea4863d 100644 --- a/agent.py +++ b/agent.py @@ -275,7 +275,6 @@ class AgentConfig: chat_model: models.ModelConfig utility_model: models.ModelConfig embeddings_model: models.ModelConfig - browser_model: models.ModelConfig mcp_servers: str profile: str = "" memory_subdir: str = "" @@ -287,7 +286,12 @@ class AgentConfig: code_exec_ssh_user: str = "root" code_exec_ssh_pass: str = "" additional: Dict[str, Any] = field(default_factory=dict) - + browser_control_headless: bool = False # Browser GUI enabled for interaction (uses VNC if available, otherwise X11 forwarding) + browser_control_cdp_url: str = "" # Chrome DevTools Protocol URL for native browser (e.g., "ws://host.docker.internal:9222/devtools/browser/..."), leave empty to use embedded browser with VNC + browser_control_start_url: str = "https://www.google.com" + browser_control_timeout: int = 5000 # milliseconds + # VNC is automatically enabled if available (configured in docker-compose.yml) + # Access browser control via noVNC when agent calls pause_for_user method @dataclass class UserMessage: @@ -676,14 +680,6 @@ def get_utility_model(self): **self.config.utility_model.build_kwargs(), ) - def get_browser_model(self): - return models.get_browser_model( - self.config.browser_model.provider, - self.config.browser_model.name, - model_config=self.config.browser_model, - **self.config.browser_model.build_kwargs(), - ) - def get_embedding_model(self): return models.get_embedding_model( self.config.embeddings_model.provider, diff --git a/docker/base/Dockerfile 
b/docker/base/Dockerfile index 7e94ed80a5..79dfaab174 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -27,6 +27,12 @@ RUN bash /ins/install_base_packages4.sh # install python after packages to ensure version overriding RUN bash /ins/install_python.sh +# install X11 support for browser display +RUN bash /ins/install_x11_support.sh + +# install VNC server and noVNC for remote browser control +RUN bash /ins/install_vnc.sh + # install searxng RUN bash /ins/install_searxng.sh diff --git a/docker/run/docker-compose.yml b/docker/run/docker-compose.yml index cc48f3f1ba..a90a70f71d 100644 --- a/docker/run/docker-compose.yml +++ b/docker/run/docker-compose.yml @@ -1,8 +1,53 @@ services: agent-zero: container_name: agent-zero - image: agent0ai/agent-zero:latest + # Use local development image (build with: docker build -f DockerfileLocal -t agent-zero-local --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) .) + image: agent-zero-local + # Use Docker Hub image for production deployments + # image: agent0ai/agent-zero:latest volumes: - - ./agent-zero:/a0 + # Mount the actual project root (not the outdated copy in ./agent-zero) + # This allows live development - changes reflected immediately without rebuild + - ../..:/a0 + # X11 socket for GUI display on macOS (auto-configured) + - /tmp/.X11-unix:/tmp/.X11-unix:rw ports: - - "50080:80" \ No newline at end of file + - "55022:22" + - "50080:80" + - "56080:6080" # noVNC web client for browser control + - "50090:9000" + - "50091:9001" + - "50092:9002" + - "50093:9003" + - "50094:9004" + - "50095:9005" + - "50096:9006" + - "50097:9007" + - "50098:9008" + - "50099:9009" + environment: + # X11 display forwarding via TCP (Docker Desktop on macOS uses VM) + - DISPLAY=host.docker.internal:0 + - XAUTHORITY=/tmp/.Xauthority + # VNC configuration for remote browser control + - VNC_DISPLAY=:99 + - VNC_RESOLUTION=1920x1080x24 + - VNC_PORT=5900 + - NOVNC_PORT=6080 + - NOVNC_EXTERNAL_PORT=56080 # External port mapping for 
noVNC access + - VNC_PASSWORD=agent-zero + # Allow container to reach host for X11 + extra_hosts: + - "host.docker.internal:host-gateway" + # Security options for X11 + security_opt: + - seccomp:unconfined + # Shared memory for browser (required for Chromium) + shm_size: '2gb' + # Auto-check and setup display and VNC on startup + command: > + bash -c " + /exe/check_display.sh || true && + /exe/start_vnc.sh || true && + /exe/initialize.sh development + " \ No newline at end of file diff --git a/docker/run/fs/etc/supervisor/conf.d/vnc.conf b/docker/run/fs/etc/supervisor/conf.d/vnc.conf new file mode 100644 index 0000000000..e20409b2da --- /dev/null +++ b/docker/run/fs/etc/supervisor/conf.d/vnc.conf @@ -0,0 +1,13 @@ +[program:run_vnc] +command=/exe/start_vnc.sh +environment= +user=root +stopwaitsecs=10 +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +autorestart=true +startretries=3 +stopasgroup=true +killasgroup=true diff --git a/docker/run/fs/exe/check_display.sh b/docker/run/fs/exe/check_display.sh new file mode 100755 index 0000000000..d0b2e7813a --- /dev/null +++ b/docker/run/fs/exe/check_display.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Automatic X11 display setup checker +# Runs on container startup to verify display forwarding +# No user interaction required - fully automatic + +set -e + +echo "========================================" +echo "Agent Zero - Display Setup Check" +echo "========================================" + +# Detect host X11 forwarding: X11 socket directory present (-d, it is a directory, not a file) or DISPLAY pointing at the Docker host +IS_MACOS=false +if [ -d /tmp/.X11-unix ] || [ "$DISPLAY" = "host.docker.internal:0" ]; then + IS_MACOS=true +fi + +# Check if DISPLAY is set +if [ -z "$DISPLAY" ]; then + echo "⚠️ No display configured (headless mode)" + echo " Browser will run in headless mode (invisible)" + echo "" + echo "To enable visible browser on macOS:" + echo " 1. Install XQuartz: https://www.xquartz.org/" + echo " 2.
Start XQuartz and restart Agent Zero" + exit 0 +fi + +# Display is configured - verify X11 libraries +echo "✓ Display configured: $DISPLAY" + +# Check if X11 libraries are installed +if ! dpkg -l | grep -q libx11-6; then + echo "Installing X11 libraries for browser display..." + apt-get update -qq + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + libx11-6 libxcb1 libxcomposite1 libxcursor1 libxdamage1 \ + libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 \ + libxtst6 libgbm1 libasound2 libatk1.0-0 libatk-bridge2.0-0 \ + libcups2 libdrm2 libgtk-3-0 libnspr4 libnss3 \ + 2>&1 | grep -v "^Reading" | grep -v "^Building" || true +fi + +echo "✓ X11 libraries installed" + +# Test X11 connection +if [ "$IS_MACOS" = true ]; then + echo "Testing X11 connection to macOS host..." + + # Try to connect to X11 + timeout 2 xdpyinfo -display "$DISPLAY" > /dev/null 2>&1 && { + echo "✓ X11 connection successful" + echo "✓ Browser will appear on your screen" + exit 0 + } || { + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "⚠️ Cannot connect to X11 display" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "" + echo "To see the browser window, you need XQuartz:" + echo "" + echo " 1. Download and install XQuartz:" + echo " https://www.xquartz.org/" + echo "" + echo " 2. Log out and log back in (required!)" + echo "" + echo " 3. Allow Docker connections:" + echo " xhost +localhost" + echo "" + echo " 4. Restart Agent Zero:" + echo " cd docker/run && docker-compose restart" + echo "" + echo "For now, browser will run in headless mode." 
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "" + exit 0 + } +fi + +echo "✓ Display setup complete" +echo "========================================" diff --git a/docker/run/fs/exe/initialize.sh b/docker/run/fs/exe/initialize.sh index 8c329bb304..ba6b5d9cc3 100644 --- a/docker/run/fs/exe/initialize.sh +++ b/docker/run/fs/exe/initialize.sh @@ -19,5 +19,8 @@ chmod 444 /root/.profile # update package list to save time later apt-get update > /dev/null 2>&1 & +# Start VNC server in the background (for browser control feature) +/exe/start_vnc.sh > /tmp/vnc_startup.log 2>&1 & + # let supervisord handle the services exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf diff --git a/docker/run/fs/exe/start_vnc.sh b/docker/run/fs/exe/start_vnc.sh new file mode 100755 index 0000000000..906e5f43dc --- /dev/null +++ b/docker/run/fs/exe/start_vnc.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# VNC Server Startup Script +# Starts Xvfb, x11vnc, and noVNC for remote browser control +# Can be safely run multiple times (idempotent) + +set -e + +echo "========================================" +echo "Agent Zero - VNC Server Setup" +echo "========================================" + +# Configuration from environment variables with defaults +VNC_DISPLAY="${VNC_DISPLAY:-:99}" +VNC_RESOLUTION="${VNC_RESOLUTION:-1920x1080x24}" +VNC_PORT="${VNC_PORT:-5900}" +NOVNC_PORT="${NOVNC_PORT:-6080}" +VNC_PASSWORD="${VNC_PASSWORD:-agent-zero}" + +# Extract display number (e.g., :99 -> 99) +DISPLAY_NUM=$(echo $VNC_DISPLAY | tr -d ':') + +echo "Configuration:" +echo " Display: $VNC_DISPLAY" +echo " Resolution: $VNC_RESOLUTION" +echo " VNC Port: $VNC_PORT" +echo " noVNC Port: $NOVNC_PORT" +echo "========================================" + +# Function to check if a process is running +is_running() { + pgrep -f "$1" > /dev/null 2>&1 +} + +# Function to kill existing VNC processes +cleanup_vnc() { + echo "Cleaning up existing VNC processes..." 
+ pkill -f "Xvfb $VNC_DISPLAY" || true + pkill -f "x11vnc.*$VNC_DISPLAY" || true + pkill -f "websockify.*$NOVNC_PORT" || true + # Remove stale lock file (socket file removal may fail, but that's OK) + rm -f /tmp/.X${DISPLAY_NUM}-lock 2>/dev/null || true + rm -f /tmp/.X11-unix/X${DISPLAY_NUM} 2>/dev/null || true + sleep 1 +} + +# Check if already running - if so, skip to monitoring +if is_running "Xvfb $VNC_DISPLAY" && is_running "x11vnc.*$VNC_DISPLAY" && is_running "websockify.*$NOVNC_PORT"; then + echo "✓ VNC server already running" + echo " - Xvfb on display $VNC_DISPLAY" + echo " - x11vnc on port $VNC_PORT" + echo " - noVNC web client on port $NOVNC_PORT" + echo "========================================" + + # Skip to monitoring instead of exiting + # Find PIDs of running processes + XVFB_PID=$(pgrep -f "Xvfb $VNC_DISPLAY" | head -1) + X11VNC_PID=$(pgrep -f "x11vnc.*$VNC_DISPLAY" | head -1) + WEBSOCKIFY_PID=$(pgrep -f "websockify.*$NOVNC_PORT" | head -1) + + # Create status file + mkdir -p /tmp/vnc + echo "DISPLAY=$VNC_DISPLAY" > /tmp/vnc/status + echo "VNC_PORT=$VNC_PORT" >> /tmp/vnc/status + echo "NOVNC_PORT=$NOVNC_PORT" >> /tmp/vnc/status + echo "XVFB_PID=$XVFB_PID" >> /tmp/vnc/status + echo "X11VNC_PID=$X11VNC_PID" >> /tmp/vnc/status + echo "WEBSOCKIFY_PID=$WEBSOCKIFY_PID" >> /tmp/vnc/status + echo "READY=true" >> /tmp/vnc/status + + # Jump to monitoring loop + # Use a label/goto simulation by setting a flag + SKIP_STARTUP=true +else + SKIP_STARTUP=false +fi + +# Only run startup if not skipping +if [ "$SKIP_STARTUP" = "false" ]; then + +# Clean up any partial VNC processes +cleanup_vnc + +# Create VNC password file +mkdir -p /root/.vnc +echo "Setting VNC password..." +x11vnc -storepasswd "$VNC_PASSWORD" /root/.vnc/passwd 2>/dev/null || { + echo "⚠️ Failed to set VNC password, trying alternative method..." 
+ # Alternative method using printf and stdin + printf "%s\n%s\n" "$VNC_PASSWORD" "$VNC_PASSWORD" | x11vnc -storepasswd /root/.vnc/passwd 2>/dev/null || { + echo "⚠️ Password setup failed, VNC may not be accessible" + } +} + +# Start Xvfb (X virtual framebuffer) +echo "Starting Xvfb on display $VNC_DISPLAY..." +Xvfb $VNC_DISPLAY -screen 0 $VNC_RESOLUTION -ac +extension GLX +render -noreset > /tmp/xvfb.log 2>&1 & +XVFB_PID=$! + +# Wait for Xvfb to be ready +sleep 2 + +if ! is_running "Xvfb $VNC_DISPLAY"; then + echo "❌ Failed to start Xvfb" + cat /tmp/xvfb.log + exit 1 +fi + +echo "✓ Xvfb started successfully (PID: $XVFB_PID)" + +# Start x11vnc (VNC server) +echo "Starting x11vnc on port $VNC_PORT..." +x11vnc \ + -display $VNC_DISPLAY \ + -rfbport $VNC_PORT \ + -rfbauth /root/.vnc/passwd \ + -forever \ + -shared \ + -noxdamage \ + -ncache 10 \ + -ncache_cr \ + -localhost \ + -quiet \ + > /tmp/x11vnc.log 2>&1 & +X11VNC_PID=$! + +# Wait for x11vnc to be ready +sleep 2 + +if ! is_running "x11vnc.*$VNC_DISPLAY"; then + echo "❌ Failed to start x11vnc" + cat /tmp/x11vnc.log + exit 1 +fi + +echo "✓ x11vnc started successfully (PID: $X11VNC_PID)" + +# Find noVNC installation +NOVNC_PATH="" +if [ -d "/opt/novnc" ]; then + NOVNC_PATH="/opt/novnc" +elif [ -d "/usr/share/novnc" ]; then + NOVNC_PATH="/usr/share/novnc" +elif [ -d "/usr/share/noVNC" ]; then + NOVNC_PATH="/usr/share/noVNC" +fi + +if [ -z "$NOVNC_PATH" ]; then + echo "⚠️ noVNC not found, VNC server running but no web access" + echo " You can still connect with a VNC client on port $VNC_PORT" + echo "========================================" + exit 0 +fi + +# Start websockify for noVNC +echo "Starting noVNC web client on port $NOVNC_PORT..." +websockify \ + --web=$NOVNC_PATH \ + $NOVNC_PORT \ + localhost:$VNC_PORT \ + > /tmp/websockify.log 2>&1 & +WEBSOCKIFY_PID=$! + +# Wait for websockify to be ready +sleep 2 + +if ! 
is_running "websockify.*$NOVNC_PORT"; then + echo "⚠️ Failed to start websockify/noVNC" + cat /tmp/websockify.log + echo " VNC server is running, but web access unavailable" +else + echo "✓ noVNC started successfully (PID: $WEBSOCKIFY_PID)" + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "🎉 VNC Server Ready!" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "" + echo " Web Access: http://localhost:$NOVNC_PORT/vnc.html" + echo " VNC Client: localhost:$DISPLAY_NUM (port $VNC_PORT)" + echo " Password: $VNC_PASSWORD" + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +fi + +echo "========================================" + +# Create status file for other scripts to check (only if we started VNC) +if [ "$SKIP_STARTUP" = "false" ]; then + mkdir -p /tmp/vnc + echo "DISPLAY=$VNC_DISPLAY" > /tmp/vnc/status + echo "VNC_PORT=$VNC_PORT" >> /tmp/vnc/status + echo "NOVNC_PORT=$NOVNC_PORT" >> /tmp/vnc/status + echo "XVFB_PID=$XVFB_PID" >> /tmp/vnc/status + echo "X11VNC_PID=$X11VNC_PID" >> /tmp/vnc/status + echo "WEBSOCKIFY_PID=$WEBSOCKIFY_PID" >> /tmp/vnc/status + echo "READY=true" >> /tmp/vnc/status +fi + +# Close the startup section +fi + +# VNC is now running in the background +# Exit the script so initialize.sh can continue +exit 0 diff --git a/docker/run/fs/ins/install_vnc.sh b/docker/run/fs/ins/install_vnc.sh new file mode 100755 index 0000000000..fdf854f41a --- /dev/null +++ b/docker/run/fs/ins/install_vnc.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Install VNC server and noVNC for remote browser control +# This allows users to manually interact with the browser when the agent pauses + +set -e + +echo "Installing VNC server and noVNC..." 
+ +# Update package list +apt-get update + +# Install Xvfb (X virtual framebuffer) for headless display +# Install x11vnc for VNC server +# Install websockify for WebSocket support (required by noVNC) +# Install novnc for web-based VNC client +DEBIAN_FRONTEND=noninteractive apt-get install -y \ + xvfb \ + x11vnc \ + websockify \ + novnc \ + net-tools \ + procps + +# Create VNC directory for password and configuration +mkdir -p /root/.vnc + +# Set default VNC password (will be overridden by environment variable) +# Using x11vnc password format - pass password as argument +x11vnc -storepasswd "agent-zero" /root/.vnc/passwd 2>/dev/null || true + +# Create symlink for noVNC to easily find it +# noVNC is typically installed in /usr/share/novnc +if [ -d "/usr/share/novnc" ]; then + ln -sf /usr/share/novnc /opt/novnc +elif [ -d "/usr/share/noVNC" ]; then + ln -sf /usr/share/noVNC /opt/novnc +fi + +# Clean up +apt-get clean +rm -rf /var/lib/apt/lists/* + +echo "✓ VNC server and noVNC installed" +echo " - Xvfb for virtual display" +echo " - x11vnc for VNC server" +echo " - noVNC for web-based access" +echo " - Default VNC password: agent-zero (change via VNC_PASSWORD env var)" diff --git a/docker/run/fs/ins/install_x11_support.sh b/docker/run/fs/ins/install_x11_support.sh new file mode 100644 index 0000000000..6bbc4b5445 --- /dev/null +++ b/docker/run/fs/ins/install_x11_support.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Install X11 and GUI support for browser display +# This allows Chromium to display on the host machine via X11 forwarding + +set -e + +echo "Installing X11 and GUI support for browser display..." 
+ +# Update package list +apt-get update + +# Install X11 libraries and dependencies for GUI applications +DEBIAN_FRONTEND=noninteractive apt-get install -y \ + libx11-6 \ + libx11-xcb1 \ + libxcb1 \ + libxcomposite1 \ + libxcursor1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxi6 \ + libxrandr2 \ + libxrender1 \ + libxss1 \ + libxtst6 \ + libxcb-dri3-0 \ + libxcb-shm0 \ + libxshmfence1 \ + libgbm1 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libgtk-3-0 \ + libnspr4 \ + libnss3 \ + libpango-1.0-0 \ + libpangocairo-1.0-0 \ + libglib2.0-0 \ + libdbus-1-3 \ + fonts-liberation \ + xdg-utils + +# Install additional fonts for better browser rendering +DEBIAN_FRONTEND=noninteractive apt-get install -y \ + fonts-noto \ + fonts-noto-cjk \ + fonts-noto-color-emoji + +# Clean up +apt-get clean +rm -rf /var/lib/apt/lists/* + +echo "✓ X11 and GUI support installed" diff --git a/initialize.py b/initialize.py index 3c42c952e5..8f77e40c13 100644 --- a/initialize.py +++ b/initialize.py @@ -60,21 +60,11 @@ def _normalize_model_kwargs(kwargs: dict) -> dict: limit_requests=current_settings["embed_model_rl_requests"], kwargs=_normalize_model_kwargs(current_settings["embed_model_kwargs"]), ) - # browser model from user settings - browser_llm = models.ModelConfig( - type=models.ModelType.CHAT, - provider=current_settings["browser_model_provider"], - name=current_settings["browser_model_name"], - api_base=current_settings["browser_model_api_base"], - vision=current_settings["browser_model_vision"], - kwargs=_normalize_model_kwargs(current_settings["browser_model_kwargs"]), - ) # agent configuration config = AgentConfig( chat_model=chat_llm, utility_model=utility_llm, embeddings_model=embedding_llm, - browser_model=browser_llm, profile=current_settings["agent_profile"], memory_subdir=current_settings["agent_memory_subdir"], knowledge_subdirs=[current_settings["agent_knowledge_subdir"], "default"], diff --git a/models.py b/models.py index 469925e49f..d155f275fa 
100644 --- a/models.py +++ b/models.py @@ -25,7 +25,7 @@ from python.helpers.providers import get_provider_config from python.helpers.rate_limiter import RateLimiter from python.helpers.tokens import approximate_tokens -from python.helpers import dirty_json, browser_use_monkeypatch +from python.helpers import dirty_json from langchain_core.language_models.chat_models import SimpleChatModel from langchain_core.outputs.chat_generation import ChatGenerationChunk @@ -43,7 +43,7 @@ from sentence_transformers import SentenceTransformer -# disable extra logging, must be done repeatedly, otherwise browser-use will turn it back on for some reason +# disable extra logging def turn_off_logging(): os.environ["LITELLM_LOG"] = "ERROR" # only errors litellm.suppress_debug_info = True @@ -56,9 +56,8 @@ def turn_off_logging(): # init load_dotenv() turn_off_logging() -browser_use_monkeypatch.apply() -litellm.modify_params = True # helps fix anthropic tool calls by browser-use +litellm.modify_params = True # helps fix anthropic tool calls class ModelType(Enum): CHAT = "Chat" @@ -578,83 +577,6 @@ def __init__(self, wrapper, *args, **kwargs): self.chat = AsyncAIChatReplacement._Chat(wrapper) -from browser_use.llm import ChatOllama, ChatOpenRouter, ChatGoogle, ChatAnthropic, ChatGroq, ChatOpenAI - -class BrowserCompatibleChatWrapper(ChatOpenRouter): - """ - A wrapper for browser agent that can filter/sanitize messages - before sending them to the LLM. 
- """ - - def __init__(self, *args, **kwargs): - turn_off_logging() - # Create the underlying LiteLLM wrapper - self._wrapper = LiteLLMChatWrapper(*args, **kwargs) - # Browser-use may expect a 'model' attribute - self.model = self._wrapper.model_name - self.kwargs = self._wrapper.kwargs - - @property - def model_name(self) -> str: - return self._wrapper.model_name - - @property - def provider(self) -> str: - return self._wrapper.provider - - def get_client(self, *args, **kwargs): # type: ignore - return AsyncAIChatReplacement(self, *args, **kwargs) - - async def _acall( - self, - messages: List[BaseMessage], - stop: Optional[List[str]] = None, - run_manager: Optional[CallbackManagerForLLMRun] = None, - **kwargs: Any, - ): - # Apply rate limiting if configured - apply_rate_limiter_sync(self._wrapper.a0_model_conf, str(messages)) - - # Call the model - try: - model = kwargs.pop("model", None) - kwrgs = {**self._wrapper.kwargs, **kwargs} - - # hack from browser-use to fix json schema for gemini (additionalProperties, $defs, $ref) - if "response_format" in kwrgs and "json_schema" in kwrgs["response_format"] and model.startswith("gemini/"): - kwrgs["response_format"]["json_schema"] = ChatGoogle("")._fix_gemini_schema(kwrgs["response_format"]["json_schema"]) - - resp = await acompletion( - model=self._wrapper.model_name, - messages=messages, - stop=stop, - **kwrgs, - ) - - # Gemini: strip triple backticks and conform schema - try: - msg = resp.choices[0].message # type: ignore - if self.provider == "gemini" and isinstance(getattr(msg, "content", None), str): - cleaned = browser_use_monkeypatch.gemini_clean_and_conform(msg.content) # type: ignore - if cleaned: - msg.content = cleaned - except Exception: - pass - - except Exception as e: - raise e - - # another hack for browser-use post process invalid jsons - try: - if "response_format" in kwrgs and "json_schema" in kwrgs["response_format"] or "json_object" in kwrgs["response_format"]: - if resp.choices[0].message.content 
is not None and not resp.choices[0].message.content.startswith("{"): # type: ignore - js = dirty_json.parse(resp.choices[0].message.content) # type: ignore - resp.choices[0].message.content = dirty_json.stringify(js) # type: ignore - except Exception as e: - pass - - return resp - class LiteLLMEmbeddingWrapper(Embeddings): model_name: str kwargs: dict = {} @@ -899,16 +821,6 @@ def get_chat_model( ) -def get_browser_model( - provider: str, name: str, model_config: Optional[ModelConfig] = None, **kwargs: Any -) -> BrowserCompatibleChatWrapper: - orig = provider.lower() - provider_name, kwargs = _merge_provider_defaults("chat", orig, kwargs) - return _get_litellm_chat( - BrowserCompatibleChatWrapper, name, provider_name, model_config, **kwargs - ) - - def get_embedding_model( provider: str, name: str, model_config: Optional[ModelConfig] = None, **kwargs: Any ) -> LiteLLMEmbeddingWrapper | LocalSentenceTransformerWrapper: diff --git a/prompts/agent.system.tool.browser.md b/prompts/agent.system.tool.browser.md index 120316e155..f0a00c46ea 100644 --- a/prompts/agent.system.tool.browser.md +++ b/prompts/agent.system.tool.browser.md @@ -1,36 +1,186 @@ -### browser_agent: - -subordinate agent controls playwright browser -message argument talks to agent give clear instructions credentials task based -reset argument spawns new agent -do not reset if iterating -be precise descriptive like: open google login and end task, log in using ... 
and end task -when following up start: considering open pages -dont use phrase wait for instructions use end task -downloads default in /a0/tmp/downloads -pass secrets and variables in message when needed - -usage: -```json -{ - "thoughts": ["I need to log in to..."], - "headline": "Opening new browser session for login", - "tool_name": "browser_agent", - "tool_args": { - "message": "Open and log me into...", - "reset": "true" - } -} -``` - -```json -{ - "thoughts": ["I need to log in to..."], - "headline": "Continuing with existing browser session", - "tool_name": "browser_agent", - "tool_args": { - "message": "Considering open pages, click...", - "reset": "false" - } -} -``` +### browser_control + +granular browser control with individual actions +use for precise web automation tasks when browser_agent is too high-level +available methods: navigate, click, type, scroll, observe_page, select, press, hover, pause_for_user, get_browser_info +screenshots captured automatically after each action for visual feedback + +**navigate** - go to URL +**click** - click element by CSS selector or text +**type** - type text into input field +**scroll** - scroll page (direction: up/down/left/right) +**observe_page** - get current page state, title, content, elements (adds screenshot to context) +**select** - select option from dropdown +**press** - press keyboard key on element +**hover** - hover over element +**pause_for_user** - pause execution for manual user interaction (CAPTCHAs, manual login, etc.) 
+ - requires browser to be in visible mode (headless=False) + - waits specified seconds for user to interact with browser + - use when encountering CAPTCHAs, blocked automation, or manual verification needed +**get_browser_info** - diagnostic tool to check browser visibility mode and troubleshoot + - shows current headless/visible mode + - displays configuration settings + - provides troubleshooting tips if browser not visible + - use when you can't see the browser window or need to verify settings + +session management: +- browser state persists across calls +- use reset arg to start fresh session +- same page context maintained between actions +- screenshots available in chat history + +usage examples: + +1. Navigate and observe +~~~json +{ + "thoughts": ["Need to open the website and see what's there"], + "headline": "Opening website", + "tool_name": "browser_control:navigate", + "tool_args": { + "url": "https://example.com" + } +} +~~~ + +2. Observe current page +~~~json +{ + "thoughts": ["Let me see what's on this page"], + "headline": "Observing page content", + "tool_name": "browser_control:observe_page", + "tool_args": {} +} +~~~ + +3. Click element +~~~json +{ + "thoughts": ["Need to click the login button"], + "headline": "Clicking login button", + "tool_name": "browser_control:click", + "tool_args": { + "selector": "button[type='submit']" + } +} +~~~ + +4. Type into field +~~~json +{ + "thoughts": ["Entering username"], + "headline": "Typing username", + "tool_name": "browser_control:type", + "tool_args": { + "selector": "input[name='username']", + "text": "myusername" + } +} +~~~ + +5. Scroll page +~~~json +{ + "thoughts": ["Need to see more content"], + "headline": "Scrolling down", + "tool_name": "browser_control:scroll", + "tool_args": { + "direction": "down" + } +} +~~~ + +6. 
Select dropdown option +~~~json +{ + "thoughts": ["Need to select country from dropdown"], + "headline": "Selecting country", + "tool_name": "browser_control:select", + "tool_args": { + "selector": "select[name='country']", + "value": "USA" + } +} +~~~ + +7. Press key +~~~json +{ + "thoughts": ["Need to submit form with Enter key"], + "headline": "Pressing Enter", + "tool_name": "browser_control:press", + "tool_args": { + "selector": "input[name='search']", + "key": "Enter" + } +} +~~~ + +8. Hover over element +~~~json +{ + "thoughts": ["Need to hover over menu to reveal submenu"], + "headline": "Hovering over menu", + "tool_name": "browser_control:hover", + "tool_args": { + "selector": "#main-menu" + } +} +~~~ + +9. Pause for user interaction +~~~json +{ + "thoughts": ["Encountered a CAPTCHA that needs manual solving"], + "headline": "Pausing for CAPTCHA", + "tool_name": "browser_control:pause_for_user", + "tool_args": { + "wait_seconds": 120, + "message": "Please solve the CAPTCHA" + } +} +~~~ + +10. Check browser visibility and settings +~~~json +{ + "thoughts": ["User says they can't see the browser window, let me check the configuration"], + "headline": "Checking browser settings", + "tool_name": "browser_control:get_browser_info", + "tool_args": {} +} +~~~ + +11. 
Reset session +~~~json +{ + "thoughts": ["Browser session seems stuck, starting fresh"], + "headline": "Resetting browser session", + "tool_name": "browser_control:navigate", + "tool_args": { + "url": "https://example.com", + "reset": "true" + } +} +~~~ + +**configuration:** +- to enable visible browser for manual interaction: set `browser_control_headless: False` in agent config +- default is visible mode (`browser_control_headless` defaults to False); set it to True to run the browser invisibly in the background +- visible mode required for pause_for_user to work +- start URL can be configured with `browser_control_start_url` +- timeout can be configured with `browser_control_timeout` (milliseconds) + +**best practices:** +- always observe_page first to understand current state +- use specific CSS selectors when possible (id, class, name attribute) +- for text-based clicking, selector will be treated as text content +- handle failures gracefully - try alternative selectors if needed +- reset session if browser gets stuck or navigation fails repeatedly +- each action is atomic - chain multiple actions for complex workflows +- screenshots show visual state after each action +- observe_page adds screenshot to your context for vision analysis +- use pause_for_user when encountering CAPTCHAs or automation blocks +- use get_browser_info when user reports browser visibility issues +- if browser was initialized in wrong mode, use reset=true to restart it + diff --git a/prompts/browser_agent.system.md b/prompts/browser_control.system.md similarity index 100% rename from prompts/browser_agent.system.md rename to prompts/browser_control.system.md diff --git a/python/api/browser_control.py b/python/api/browser_control.py new file mode 100644 index 0000000000..1899ff66b4 --- /dev/null +++ b/python/api/browser_control.py @@ -0,0 +1,81 @@ +from python.helpers.api import ApiHandler, Request, Response +from flask import send_file, redirect +import os + +class BrowserControl(ApiHandler): + """ + API endpoint for accessing the browser control
interface (noVNC). + This allows users to manually interact with the browser when the agent pauses. + """ + + @classmethod + def requires_auth(cls) -> bool: + # Require authentication for browser control access + return True + + @classmethod + def requires_csrf(cls) -> bool: + # CSRF not needed for GET requests + return False + + @classmethod + def get_methods(cls) -> list[str]: + return ["GET"] + + async def process(self, input: dict, request: Request) -> dict | Response: + """ + Returns information about the VNC server and provides access to noVNC client. + + Query parameters: + - action: 'info' (default) | 'redirect' + - info: Returns VNC connection details + - redirect: Redirects to the noVNC web client + """ + action = request.args.get('action', 'info') + + # Check if VNC is running by reading status file + vnc_status_file = '/tmp/vnc/status' + vnc_ready = False + vnc_display = ':99' + novnc_port = '6080' + + # Get external port mapping from environment variable (for Docker port mapping) + # Default to 56080 which is the standard external mapping for noVNC port 6080 + external_novnc_port = os.environ.get('NOVNC_EXTERNAL_PORT', '56080') + + if os.path.exists(vnc_status_file): + try: + with open(vnc_status_file, 'r') as f: + status_lines = f.readlines() + status_dict = {} + for line in status_lines: + if '=' in line: + key, value = line.strip().split('=', 1) + status_dict[key] = value + + vnc_ready = status_dict.get('READY', 'false') == 'true' + vnc_display = status_dict.get('DISPLAY', ':99') + novnc_port = status_dict.get('NOVNC_PORT', '6080') + except Exception as e: + pass + + if action == 'redirect': + # Redirect to noVNC client using external port mapping with optimized parameters + novnc_url = f"http://localhost:{external_novnc_port}/vnc.html?autoconnect=true&resize=none&reconnect=true&reconnect_delay=1000&show_dot=true" + return redirect(novnc_url, code=302) + + # Default: return info with optimized noVNC URL parameters + # Parameters: + # - autoconnect: 
Connect automatically on load + # - resize=scale: Scale the remote session to fit the viewport + # - reconnect: Automatically reconnect if connection is lost + # - reconnect_delay: Wait 1 second before reconnecting + # - show_dot: Show connection status indicator + return { + "vnc_ready": vnc_ready, + "vnc_display": vnc_display, + "novnc_port": novnc_port, + "external_novnc_port": external_novnc_port, + "novnc_url": f"http://localhost:{external_novnc_port}/vnc.html?autoconnect=true&resize=none&reconnect=true&reconnect_delay=1000&show_dot=true", + "instructions": "Click the noVNC URL to access the browser control interface" if vnc_ready else "VNC server is not running" + } diff --git a/python/helpers/browser_control_client.py b/python/helpers/browser_control_client.py new file mode 100644 index 0000000000..7a891fada9 --- /dev/null +++ b/python/helpers/browser_control_client.py @@ -0,0 +1,602 @@ +""" +Browser Control Client - Playwright interface for browser automation. + +This module provides the PlaywrightClient for browser automation. 
class ActionType(str, Enum):
    """Supported action types for interface automation.

    Members are also plain strings, so ``ActionType.CLICK == "click"``.
    """

    CLICK = "click"
    TYPE = "type"
    SELECT = "select"
    NAVIGATE = "navigate"
    SCREENSHOT = "screenshot"
    SCROLL = "scroll"
    PRESS = "press"
    HOVER = "hover"
    PAUSE_FOR_USER = "pause_for_user"


@dataclass
class Action:
    """Represents an action to be executed on an interface."""

    action_type: ActionType
    selector: Optional[str] = None
    value: Optional[str] = None
    coordinates: Optional[Dict[str, int]] = None
    # May be passed as None; __post_init__ replaces None with a fresh dict so
    # instances never share one mutable default.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


@dataclass
class ActionResult:
    """Result of executing an action on an interface."""

    success: bool
    description: str
    error: Optional[str] = None
    screenshot: Optional[bytes] = None
    task_complete: bool = False
    # None is normalised to a fresh dict in __post_init__.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


@dataclass
class InterfaceState:
    """Represents the current state of an interface."""

    url: Optional[str] = None
    title: Optional[str] = None
    content: str = ""
    # Both container fields accept None and are normalised to fresh empty
    # containers in __post_init__.
    interactive_elements: Optional[List[Dict[str, Any]]] = None
    screenshot: Optional[bytes] = None
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        if self.interactive_elements is None:
            self.interactive_elements = []
        if self.metadata is None:
            self.metadata = {}
class PlaywrightClient:
    """
    Web interface automation using Playwright.

    Provides browser automation capabilities for web applications. A client
    either launches its own Chromium (embedded mode) or attaches to an
    existing browser over the Chrome DevTools Protocol (``cdp_url``).
    """

    def __init__(
        self,
        start_url: str = "https://www.google.com",
        headless: bool = True,
        playwright_binary: Optional[str] = None,
        cdp_url: Optional[str] = None,
        use_vnc: bool = False,
        vnc_display: Optional[str] = None
    ):
        """
        Initialize Playwright web client.

        Args:
            start_url: Initial URL to navigate to
            headless: Whether to run browser in headless mode
            playwright_binary: Path to Playwright binary (optional)
            cdp_url: Chrome DevTools Protocol URL to connect to existing browser (optional)
                     e.g., "http://localhost:9222" or "http://host.docker.internal:9222"
            use_vnc: Whether to use VNC display for browser visibility
            vnc_display: VNC display number (e.g., ":99"). If None, read from VNC_DISPLAY env var
        """
        self.start_url = start_url
        self.headless = headless
        self.playwright_binary = playwright_binary
        self.cdp_url = cdp_url
        self.use_vnc = use_vnc
        self.vnc_display = vnc_display
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.action_history = []

    async def initialize(self) -> None:
        """Initialize Playwright browser session and open ``start_url``.

        Raises:
            ImportError: if the ``playwright`` package is not installed.
            Exception: whatever Playwright raises when launching, connecting
                or navigating. If no page could be created, everything that
                was already started is shut down before re-raising.
        """
        try:
            from playwright.async_api import async_playwright
        except ImportError as err:
            raise ImportError(
                "Playwright is not installed. Install with: pip install playwright"
            ) from err

        # Configure VNC display if enabled
        if self.use_vnc:
            import os
            # Get VNC display from instance variable or environment
            vnc_display = self.vnc_display or os.environ.get('VNC_DISPLAY', ':99')
            # Store original display to restore if needed
            self._original_display = os.environ.get('DISPLAY')
            # Set DISPLAY environment variable for Playwright to use VNC
            os.environ['DISPLAY'] = vnc_display
            print(f"Using VNC display: {vnc_display}")

        self.playwright = await async_playwright().start()

        try:
            if self.cdp_url:
                await self._connect_over_cdp()
            else:
                await self._launch_embedded()
        except Exception:
            # Without a page the client is unusable; do not leave the
            # Playwright driver (or a half-started browser) running.
            try:
                await self.close()
            except Exception:
                pass  # keep the original startup error as the one reported
            raise

        # Navigate to start URL
        await self.page.goto(self.start_url)

    async def _connect_over_cdp(self) -> None:
        """Attach to an already running browser through ``self.cdp_url``."""
        print(f"Connecting to browser via CDP: {self.cdp_url}")

        # Handle host.docker.internal Host header issue
        # Convert HTTP endpoint to WebSocket to bypass Chrome's Host header validation
        endpoint_url = self.cdp_url
        if endpoint_url.startswith("http://") and "host.docker.internal" in endpoint_url:
            import re
            port_match = re.search(r':(\d+)', endpoint_url)
            if port_match:
                port = port_match.group(1)
                # Use WebSocket format - less strict Host header checking
                endpoint_url = f"ws://host.docker.internal:{port}"
                print(f"  → Converted to WebSocket: {endpoint_url}")

        self.browser = await self.playwright.chromium.connect_over_cdp(
            endpoint_url=endpoint_url,
            timeout=30000  # 30 seconds
        )
        # Use the default context from the connected browser
        if self.browser.contexts:
            self.context = self.browser.contexts[0]
        else:
            self.context = await self.browser.new_context(
                viewport={"width": 800, "height": 1600}
            )
        # Use existing page or create new one
        if self.context.pages:
            self.page = self.context.pages[0]
        else:
            self.page = await self.context.new_page()

    async def _launch_embedded(self) -> None:
        """Launch a Chromium instance owned by this client."""
        # Launch browser with optional binary path (embedded browser mode)
        launch_options = {
            "headless": self.headless,
            "args": ["--headless=new"] if self.headless else []
        }
        if self.playwright_binary:
            launch_options["executable_path"] = self.playwright_binary

        self.browser = await self.playwright.chromium.launch(**launch_options)

        # Create context with viewport size matching browser_agent
        self.context = await self.browser.new_context(
            viewport={"width": 800, "height": 1600}
        )
        self.page = await self.context.new_page()

    async def get_state(self, format: str = "hybrid") -> "InterfaceState":
        """Get current state of the web page.

        Args:
            format: "text" (HTML content + interactive elements), "visual"
                (screenshot) or "hybrid" (both).

        Raises:
            RuntimeError: if the browser has not been initialized.
        """
        if not self.page:
            raise RuntimeError("Browser not initialized. Call initialize() first.")

        state = InterfaceState(url=self.page.url, title=await self.page.title())

        if format in ["text", "hybrid"]:
            # page.content() returns the page HTML, not just visible text
            state.content = await self.page.content()
            state.interactive_elements = await self._get_interactive_elements()

        if format in ["visual", "hybrid"]:
            state.screenshot = await self.get_screenshot()

        return state

    async def _get_interactive_elements(self) -> List[Dict[str, Any]]:
        """Extract interactive elements from the page.

        Returns an empty list when there is no page or the in-page script fails.
        """
        if not self.page:
            return []

        try:
            elements = await self.page.evaluate(
                """
                () => {
                    const interactiveSelectors = [
                        'button', 'a', 'input', 'select', 'textarea',
                        '[role="button"]', '[role="link"]', '[onclick]'
                    ];

                    const elements = [];
                    interactiveSelectors.forEach(selector => {
                        document.querySelectorAll(selector).forEach(el => {
                            if (el.offsetParent !== null) {  // Is visible
                                // SVG elements expose className/href as objects,
                                // not strings; treat those as absent instead of
                                // throwing and losing the whole element list.
                                const cls = typeof el.className === 'string'
                                    ? el.className.trim() : '';
                                const href = typeof el.href === 'string' ? el.href : '';
                                elements.push({
                                    tag: el.tagName.toLowerCase(),
                                    text: el.innerText || el.value || '',
                                    type: el.type || '',
                                    placeholder: el.placeholder || '',
                                    href: href,
                                    selector: el.id ? `#${el.id}` :
                                             cls ? `.${cls.split(' ')[0]}` :
                                             el.tagName.toLowerCase()
                                });
                            }
                        });
                    });
                    return elements;
                }
            """
            )
            return elements
        except Exception:
            return []

    async def execute_action(self, action: "Action") -> "ActionResult":
        """Execute an action on the web page.

        Failures while performing the action are reported as an
        ``ActionResult`` with ``success=False`` and ``error`` set; they are
        not raised.

        Raises:
            RuntimeError: if the browser has not been initialized.
        """
        if not self.page:
            raise RuntimeError("Browser not initialized. Call initialize() first.")

        handlers = {
            ActionType.NAVIGATE: self._navigate,
            ActionType.CLICK: self._click,
            ActionType.TYPE: self._type,
            ActionType.SELECT: self._select,
            ActionType.PRESS: self._press,
            ActionType.SCROLL: self._scroll,
            ActionType.HOVER: self._hover,
            ActionType.PAUSE_FOR_USER: self._pause_for_user,
        }

        try:
            handler = handlers.get(action.action_type)
            if handler is None:
                result = ActionResult(
                    success=False,
                    description="",
                    error=f"Unsupported action type: {action.action_type}",
                )
            else:
                result = await handler(action)

            # Record action in history
            self.action_history.append(action)

            # Add screenshot if requested
            if action.metadata.get("capture_screenshot", False):
                result.screenshot = await self.get_screenshot()

            return result

        except Exception as e:
            return ActionResult(success=False, description="", error=str(e))

    async def _navigate(self, action: "Action") -> "ActionResult":
        """Go to ``action.value``, tolerating pages that never become network-idle."""
        if not action.value:
            raise ValueError("Navigate action requires a URL value")

        previous_url = self.page.url
        try:
            # First attempt: wait for networkidle (ideal but may timeout on slow sites)
            await self.page.goto(
                action.value, wait_until="networkidle", timeout=5000
            )
        except Exception:
            # Fallback: accept the navigation only if the page really moved.
            # Merely "not about:blank" is not enough - a failed goto leaves the
            # previous page in place and must not be reported as success.
            current_url = self.page.url
            reached_target = bool(current_url) and action.value in current_url
            moved_elsewhere = (
                bool(current_url)
                and current_url != previous_url
                and current_url != "about:blank"
                and not current_url.startswith("chrome-error://")
            )
            if not (reached_target or moved_elsewhere):
                # Navigation truly failed
                raise
            try:
                # Wait a bit for DOM to be ready
                await self.page.wait_for_load_state(
                    "domcontentloaded", timeout=5000
                )
            except Exception:
                pass  # best effort: the page is already usable enough to report
            return ActionResult(
                success=True,
                description=f"Navigated to {action.value} (page loaded but not fully idle)",
            )

        return ActionResult(
            success=True, description=f"Navigated to {action.value}"
        )

    async def _click(self, action: "Action") -> "ActionResult":
        """Click ``action.selector`` using the fallback strategies below."""
        if not action.selector:
            raise ValueError("Click action requires a selector")
        clicked_as = await self._click_first_match(action.selector)
        return ActionResult(
            success=True, description=f"Clicked on {clicked_as}"
        )

    async def _click_first_match(self, selector: str) -> str:
        """Try click strategies in order and stop at the first that succeeds.

        Order: CSS selector (after waiting for visibility), text extracted
        from ``:contains(...)``, the selector as plain text, and finally a
        forced click on the original selector. Exactly one click is issued.

        Returns:
            The selector that was clicked (with `` (forced)`` appended for the
            forced strategy).

        Raises:
            Exception: if every strategy failed; the message lists the
                attempted selectors and the last error.
        """
        import re

        # Each candidate: (label for the error report, selector to click,
        #                  visibility wait in ms or None, force click?, description)
        candidates = [(f"CSS:{selector}", selector, 3000, False, selector)]

        # If selector contains :contains(), extract the text and try text-based lookups
        if ":contains(" in selector:
            match = re.search(r":contains\(['\"]?(.*?)['\"]?\)", selector)
            if match:
                text = match.group(1)
                exact = f"text={text}"
                # Escape so text such as "C++" or "Price (USD)" stays a literal
                partial = f"text=/.*{re.escape(text)}.*/i"
                link_selector = f"a[href*='{text.lower()}']"
                candidates.append((exact, exact, 2000, False, exact))
                candidates.append((partial, partial, None, False, partial))
                candidates.append((link_selector, link_selector, None, False, link_selector))

        # If plain text (not CSS), try as text selector
        if not selector.startswith(("#", ".", "[")):
            exact = f"text={selector}"
            partial = f"text=/.*{re.escape(selector)}.*/i"
            candidates.append((exact, exact, None, False, exact))
            candidates.append((partial, partial, None, False, partial))

        # Last resort: force click if element is covered (e.g., by ads)
        candidates.append(
            (f"force:{selector}", selector, None, True, f"{selector} (forced)")
        )

        attempted_selectors: List[str] = []
        last_error: Optional[str] = None
        for label, target, wait_ms, force, description in candidates:
            try:
                if wait_ms is not None:
                    await self.page.wait_for_selector(
                        target, state="visible", timeout=wait_ms
                    )
                if force:
                    await self.page.click(target, force=True, timeout=2000)
                else:
                    await self.page.click(target, timeout=2000)
                return description
            except Exception as e:
                attempted_selectors.append(label)
                last_error = str(e)

        # Provide helpful error message with all attempted strategies
        error_msg = f"Failed to click element. Attempted selectors: {', '.join(attempted_selectors)}. Last error: {last_error}"
        raise Exception(error_msg)

    async def _type(self, action: "Action") -> "ActionResult":
        """Fill ``action.value`` into the input matched by ``action.selector``."""
        if not action.selector or not action.value:
            raise ValueError("Type action requires both selector and value")
        # Wait for input to be visible before typing
        await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
        await self.page.fill(action.selector, action.value)
        return ActionResult(
            success=True,
            description=f"Typed '{action.value}' into {action.selector}",
        )

    async def _select(self, action: "Action") -> "ActionResult":
        """Select option ``action.value`` in the element matched by ``action.selector``."""
        if not action.selector or not action.value:
            raise ValueError("Select action requires both selector and value")
        # Wait for select element to be visible
        await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
        await self.page.select_option(action.selector, action.value)
        return ActionResult(
            success=True,
            description=f"Selected '{action.value}' in {action.selector}",
        )

    async def _press(self, action: "Action") -> "ActionResult":
        """Press key ``action.value`` on the element matched by ``action.selector``."""
        if not action.selector or not action.value:
            raise ValueError("Press action requires both selector and value")
        # Wait for element to be visible before pressing key
        await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
        await self.page.press(action.selector, action.value)
        return ActionResult(
            success=True,
            description=f"Pressed '{action.value}' on {action.selector}",
        )

    async def _scroll(self, action: "Action") -> "ActionResult":
        """Scroll the window; ``action.value`` is a direction or a pixel count."""
        # Map direction to scroll values
        direction = action.value or "down"
        scroll_x, scroll_y = 0, 0

        if direction == "down":
            scroll_y = 500
        elif direction == "up":
            scroll_y = -500
        elif direction == "right":
            scroll_x = 500
        elif direction == "left":
            scroll_x = -500
        else:
            # If it's a number, use it directly for vertical scrolling
            try:
                scroll_y = int(direction)
            except ValueError:
                scroll_y = 500  # Default to scrolling down

        await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
        return ActionResult(
            success=True,
            description=f"Scrolled {direction} by {abs(scroll_y or scroll_x)} pixels",
        )

    async def _hover(self, action: "Action") -> "ActionResult":
        """Move the pointer over the element matched by ``action.selector``."""
        if not action.selector:
            raise ValueError("Hover action requires a selector")
        # Wait for element to be visible before hovering
        await self.page.wait_for_selector(action.selector, state="visible", timeout=3000)
        await self.page.hover(action.selector)
        return ActionResult(
            success=True, description=f"Hovered over {action.selector}"
        )

    async def _pause_for_user(self, action: "Action") -> "ActionResult":
        """Announce that the browser is ready for manual interaction.

        This does not block: it prints instructions and returns immediately so
        the caller can implement the actual waiting. ``action.value`` is the
        advertised wait in seconds (default 60).
        """
        # This is useful for CAPTCHAs, manual login, or other user interventions
        wait_time = int(action.value) if action.value else 60
        message = action.metadata.get("message", "Pausing for user interaction...")

        # Check if VNC is available for user interaction
        vnc_url = self.get_vnc_url(host="localhost", port=56080)

        if not vnc_url and self.headless:
            # No VNC and headless - user has no way to interact
            return ActionResult(
                success=False,
                description="",
                error="Cannot pause for user: browser is in headless mode and VNC is not available. Set headless=False or enable VNC when initializing the browser."
            )

        # VNC is available or browser is visible - user can interact
        print(f"\n{'='*60}")
        print(f"BROWSER READY FOR USER INTERACTION: {message}")
        print(f"Current URL: {self.page.url}")
        if vnc_url:
            print(f"VNC URL: {vnc_url}")
            print(f"Browser control panel will open automatically in web UI")
        else:
            print(f"Browser window should be visible on your display")
        print(f"Agent will wait up to {wait_time} seconds")
        print(f"{'='*60}\n")

        return ActionResult(
            success=True,
            description=f"Browser ready for user interaction. Agent will pause for up to {wait_time} seconds. Current page: {self.page.url}"
        )

    async def get_screenshot(self) -> bytes:
        """Get screenshot of current page as PNG bytes.

        Raises:
            RuntimeError: if the browser has not been initialized.
        """
        if not self.page:
            raise RuntimeError("Browser not initialized. Call initialize() first.")

        return await self.page.screenshot(type="png", full_page=False)

    async def get_screenshot_base64(self) -> str:
        """Get screenshot of current page as base64 string for LLM context."""
        screenshot_bytes = await self.get_screenshot()
        return base64.b64encode(screenshot_bytes).decode('utf-8')

    def get_vnc_url(self, host: str = "localhost", port: int = 6080) -> Optional[str]:
        """
        Get the noVNC URL for manual browser control.

        Args:
            host: Host where noVNC is accessible (default: localhost)
            port: Port used when the status file has no NOVNC_PORT entry
                  (default: 6080); a NOVNC_PORT entry takes precedence

        Returns:
            noVNC URL if VNC is enabled and its status file reports
            READY=true, None otherwise
        """
        if not self.use_vnc:
            return None

        vnc_status_file = '/tmp/vnc/status'
        try:
            with open(vnc_status_file, 'r') as f:
                status_lines = f.readlines()
        except (OSError, ValueError):
            # Missing or unreadable status file means VNC is not available
            return None

        status_dict = {}
        for line in status_lines:
            if '=' in line:
                key, value = line.strip().split('=', 1)
                status_dict[key] = value

        if status_dict.get('READY', 'false') != 'true':
            return None

        novnc_port = status_dict.get('NOVNC_PORT', str(port))
        return f"http://{host}:{novnc_port}/vnc.html?autoconnect=true&resize=none"

    async def close(self) -> None:
        """Close browser and clean up.

        Every resource is attempted even if an earlier one fails, and the
        references are cleared so a second call is a no-op. The first error
        encountered (if any) is re-raised after all steps have run.
        """
        shutdown_steps = (
            ("page", "close"),
            ("context", "close"),
            ("browser", "close"),
            ("playwright", "stop"),
        )
        first_error: Optional[Exception] = None
        for attr, method_name in shutdown_steps:
            resource = getattr(self, attr)
            if not resource:
                continue
            setattr(self, attr, None)
            try:
                await getattr(resource, method_name)()
            except Exception as e:
                if first_error is None:
                    first_error = e
        if first_error is not None:
            raise first_error
------------------------------------------------------------------------------ -# This function sanitizes and conforms the JSON output from Gemini to match -# the specific schema expectations of the browser-use library. It handles -# markdown fences, aliases actions (like 'complete_task' to 'done'), and -# intelligently constructs a valid 'data' object for the final action. - -def gemini_clean_and_conform(text: str): - obj = None - try: - # dirty_json parser is robust enough to handle markdown fences - obj = dirty_json.parse(text) - except Exception: - return None # return None if parsing fails - - if not isinstance(obj, dict): - return None - - # Conform actions to browser-use expectations - if isinstance(obj.get("action"), list): - normalized_actions = [] - for item in obj["action"]: - if not isinstance(item, dict): - continue # Skip non-dict items - - action_key, action_value = next(iter(item.items()), (None, None)) - if not action_key: - continue - - # Alias 'complete_task' to 'done' to handle inconsistencies - if action_key == "complete_task": - action_key = "done" - - # Create a mutable copy of the value - v = (action_value or {}).copy() - - if action_key in ("scroll_down", "scroll_up", "scroll"): - is_down = action_key != "scroll_up" - v.setdefault("down", is_down) - v.setdefault("num_pages", 1.0) - normalized_actions.append({"scroll": v}) - elif action_key == "go_to_url": - v.setdefault("new_tab", False) - normalized_actions.append({action_key: v}) - elif action_key == "done": - # If `data` is missing, construct it from other keys - if "data" not in v: - # Pop fields from the top-level `done` object - response_text = v.pop("response", None) - summary_text = v.pop("page_summary", None) - title_text = v.pop("title", "Task Completed") - - final_response = response_text or "Task completed successfully." # browser-use expects string - final_summary = summary_text or "No page summary available." 
# browser-use expects string - - v["data"] = { - "title": title_text, - "response": final_response, - "page_summary": final_summary, - } - - v.setdefault("success", True) - normalized_actions.append({action_key: v}) - else: - normalized_actions.append(item) - obj["action"] = normalized_actions - - return dirty_json.stringify(obj) - -# ------------------------------------------------------------------------------ -# Monkey-patch for browser-use Gemini schema issue -# ------------------------------------------------------------------------------ -# The original _fix_gemini_schema in browser_use.llm.google.chat.ChatGoogle -# removes the 'title' property but fails to remove it from the 'required' list, -# causing a validation error with the Gemini API. This patch corrects that behavior. - -def _patched_fix_gemini_schema(self, schema: dict[str, Any]) -> dict[str, Any]: - """ - Convert a Pydantic model to a Gemini-compatible schema. - - This function removes unsupported properties like 'additionalProperties' and resolves - $ref references that Gemini doesn't support. 
- """ - - # Handle $defs and $ref resolution - if '$defs' in schema: - defs = schema.pop('$defs') - - def resolve_refs(obj: Any) -> Any: - if isinstance(obj, dict): - if '$ref' in obj: - ref = obj.pop('$ref') - ref_name = ref.split('/')[-1] - if ref_name in defs: - # Replace the reference with the actual definition - resolved = defs[ref_name].copy() - # Merge any additional properties from the reference - for key, value in obj.items(): - if key != '$ref': - resolved[key] = value - return resolve_refs(resolved) - return obj - else: - # Recursively process all dictionary values - return {k: resolve_refs(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [resolve_refs(item) for item in obj] - return obj - - schema = resolve_refs(schema) - - # Remove unsupported properties - def clean_schema(obj: Any) -> Any: - if isinstance(obj, dict): - # Remove unsupported properties - cleaned = {} - for key, value in obj.items(): - if key not in ['additionalProperties', 'title', 'default']: - cleaned_value = clean_schema(value) - # Handle empty object properties - Gemini doesn't allow empty OBJECT types - if ( - key == 'properties' - and isinstance(cleaned_value, dict) - and len(cleaned_value) == 0 - and isinstance(obj.get('type', ''), str) - and obj.get('type', '').upper() == 'OBJECT' - ): - # Convert empty object to have at least one property - cleaned['properties'] = {'_placeholder': {'type': 'string'}} - else: - cleaned[key] = cleaned_value - - # If this is an object type with empty properties, add a placeholder - if ( - isinstance(cleaned.get('type', ''), str) - and cleaned.get('type', '').upper() == 'OBJECT' - and 'properties' in cleaned - and isinstance(cleaned['properties'], dict) - and len(cleaned['properties']) == 0 - ): - cleaned['properties'] = {'_placeholder': {'type': 'string'}} - - # PATCH: Also remove 'title' from the required list if it exists - if 'required' in cleaned and isinstance(cleaned.get('required'), list): - cleaned['required'] = [p for p 
def get_playwright_binary():
    """Locate the Playwright Chromium executable for the current OS.

    The full Chromium build is preferred because it can run both headless and
    with a visible window; the headless shell is only used when no full build
    is present in the cache. Only paths belonging to the running platform are
    searched, so a binary for another OS left in a shared cache (for example
    a macOS build inside a Linux container) is never returned.

    Returns:
        Path to the executable, or None if nothing suitable is cached (or the
        platform is not one of macOS, Linux, Windows).
    """
    # platform.system() value -> (full browser glob, headless shell glob)
    globs_by_system = {
        "Darwin": (
            "chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium",
            "chromium_headless_shell-*/chrome-mac/headless_shell",
        ),
        "Linux": (
            "chromium-*/chrome-linux/chrome",
            "chromium_headless_shell-*/chrome-linux/headless_shell",
        ),
        "Windows": (
            "chromium-*/chrome-win/chrome.exe",
            "chromium_headless_shell-*/chrome-win/headless_shell.exe",
        ),
    }

    cache_root = Path(get_playwright_cache_dir())
    globs = globs_by_system.get(platform.system())
    if globs is None:
        return None

    full_glob, shell_glob = globs
    full_browser = next(cache_root.glob(full_glob), None)
    if full_browser:
        return full_browser

    # Fallback: headless-only shell, which cannot show a GUI
    return next(cache_root.glob(shell_glob), None)
+ """ + import os + import shutil + bin = get_playwright_binary() if not bin: cache = get_playwright_cache_dir() - import os + pw_cache = Path(cache) + + # Clean up wrong-platform binaries to avoid confusion and save space + system = platform.system() + wrong_platform_dirs = [] + + if system != "Darwin": # Not macOS - remove macOS binaries + wrong_platform_dirs.extend(pw_cache.glob("chromium-*/chrome-mac")) + if system != "Linux": # Not Linux - remove Linux binaries + wrong_platform_dirs.extend(pw_cache.glob("chromium-*/chrome-linux")) + if system != "Windows": # Not Windows - remove Windows binaries + wrong_platform_dirs.extend(pw_cache.glob("chromium-*/chrome-win")) + + for wrong_dir in wrong_platform_dirs: + print(f"Removing wrong-platform binary: {wrong_dir}") + # Remove the entire chromium-* directory, not just the platform subdirectory + chromium_dir = wrong_dir.parent + if chromium_dir.exists(): + shutil.rmtree(chromium_dir) + env = os.environ.copy() env["PLAYWRIGHT_BROWSERS_PATH"] = cache - subprocess.check_call( - ["playwright", "install", "chromium", "--only-shell"], - env=env - ) + + # Install full Chromium browser (supports both visible and headless modes) + print(f"Installing Playwright Chromium browser for {system} (supports visible mode)...") + try: + subprocess.check_call( + [sys.executable, "-m", "playwright", "install", "chromium"], + env=env + ) + except subprocess.CalledProcessError as e: + print(f"Failed to install full Chromium: {e}") + print("Falling back to headless shell (headless-only)...") + # Fallback: install headless shell only + subprocess.check_call( + [sys.executable, "-m", "playwright", "install", "chromium", "--only-shell"], + env=env + ) + bin = get_playwright_binary() if not bin: raise Exception("Playwright binary not found after installation") diff --git a/python/helpers/settings.py b/python/helpers/settings.py index d882de94c9..a7a790d436 100644 --- a/python/helpers/settings.py +++ b/python/helpers/settings.py @@ -46,14 +46,6 
@@ class Settings(TypedDict): embed_model_rl_requests: int embed_model_rl_input: int - browser_model_provider: str - browser_model_name: str - browser_model_api_base: str - browser_model_vision: bool - browser_model_rl_requests: int - browser_model_rl_input: int - browser_model_rl_output: int - browser_model_kwargs: dict[str, Any] browser_http_headers: dict[str, Any] agent_profile: str @@ -429,106 +421,6 @@ def convert_out(settings: Settings) -> SettingsOutput: "tab": "agent", } - # embedding model section - browser_model_fields: list[SettingsField] = [] - browser_model_fields.append( - { - "id": "browser_model_provider", - "title": "Web Browser model provider", - "description": "Select provider for web browser model used by browser-use framework", - "type": "select", - "value": settings["browser_model_provider"], - "options": cast(list[FieldOption], get_providers("chat")), - } - ) - browser_model_fields.append( - { - "id": "browser_model_name", - "title": "Web Browser model name", - "description": "Exact name of model from selected provider", - "type": "text", - "value": settings["browser_model_name"], - } - ) - - browser_model_fields.append( - { - "id": "browser_model_api_base", - "title": "Web Browser model API base URL", - "description": "API base URL for web browser model. Leave empty for default. Only relevant for Azure, local and custom (other) providers.", - "type": "text", - "value": settings["browser_model_api_base"], - } - ) - - browser_model_fields.append( - { - "id": "browser_model_vision", - "title": "Use Vision", - "description": "Models capable of Vision can use it to analyze web pages from screenshots. 
Increases quality but also token usage.", - "type": "switch", - "value": settings["browser_model_vision"], - } - ) - - browser_model_fields.append( - { - "id": "browser_model_rl_requests", - "title": "Web Browser model rate limit requests", - "description": "Rate limit requests for web browser model.", - "type": "number", - "value": settings["browser_model_rl_requests"], - } - ) - - browser_model_fields.append( - { - "id": "browser_model_rl_input", - "title": "Web Browser model rate limit input", - "description": "Rate limit input for web browser model.", - "type": "number", - "value": settings["browser_model_rl_input"], - } - ) - - browser_model_fields.append( - { - "id": "browser_model_rl_output", - "title": "Web Browser model rate limit output", - "description": "Rate limit output for web browser model.", - "type": "number", - "value": settings["browser_model_rl_output"], - } - ) - - browser_model_fields.append( - { - "id": "browser_model_kwargs", - "title": "Web Browser model additional parameters", - "description": "Any other parameters supported by LiteLLM. Format is KEY=VALUE on individual lines, like .env file. Value can also contain JSON objects - when unquoted, it is treated as object, number etc., when quoted, it is treated as string.", - "type": "textarea", - "value": _dict_to_env(settings["browser_model_kwargs"]), - } - ) - - browser_model_fields.append( - { - "id": "browser_http_headers", - "title": "HTTP Headers", - "description": "HTTP headers to include with all browser requests. Format is KEY=VALUE on individual lines, like .env file. Value can also contain JSON objects - when unquoted, it is treated as object, number etc., when quoted, it is treated as string. 
Example: Authorization=Bearer token123", - "type": "textarea", - "value": _dict_to_env(settings.get("browser_http_headers", {})), - } - ) - - browser_model_section: SettingsSection = { - "id": "browser_model", - "title": "Web Browser Model", - "description": "Settings for the web browser model. Agent Zero uses browser-use agentic framework to handle web interactions.", - "fields": browser_model_fields, - "tab": "agent", - } - # basic auth section auth_fields: list[SettingsField] = [] @@ -1257,7 +1149,6 @@ def convert_out(settings: Settings) -> SettingsOutput: agent_section, chat_model_section, util_model_section, - browser_model_section, embed_model_section, memory_section, speech_section, @@ -1451,14 +1342,6 @@ def get_default_settings() -> Settings: embed_model_kwargs={}, embed_model_rl_requests=0, embed_model_rl_input=0, - browser_model_provider="openrouter", - browser_model_name="openai/gpt-4.1", - browser_model_api_base="", - browser_model_vision=True, - browser_model_rl_requests=0, - browser_model_rl_input=0, - browser_model_rl_output=0, - browser_model_kwargs={"temperature": "0"}, browser_http_headers={}, memory_recall_enabled=True, memory_recall_delayed=False, diff --git a/python/helpers/vector_db.py b/python/helpers/vector_db.py index 2b94960e31..c68c517d17 100644 --- a/python/helpers/vector_db.py +++ b/python/helpers/vector_db.py @@ -147,4 +147,4 @@ def comparator(data: dict[str, Any]): # PrintStyle.error(f"Error evaluating condition: {e}") return False - return comparator + return comparator \ No newline at end of file diff --git a/python/tools/browser_agent.py b/python/tools/browser_agent.py deleted file mode 100644 index 6d5f085b26..0000000000 --- a/python/tools/browser_agent.py +++ /dev/null @@ -1,428 +0,0 @@ -import asyncio -import time -from typing import Optional, cast -from agent import Agent, InterventionException -from pathlib import Path - -from python.helpers.tool import Tool, Response -from python.helpers import files, defer, persist_chat, 
strings -from python.helpers.browser_use import browser_use # type: ignore[attr-defined] -from python.helpers.print_style import PrintStyle -from python.helpers.playwright import ensure_playwright_binary -from python.helpers.secrets import get_secrets_manager -from python.extensions.message_loop_start._10_iteration_no import get_iter_no -from pydantic import BaseModel -import uuid -from python.helpers.dirty_json import DirtyJson - - -class State: - @staticmethod - async def create(agent: Agent): - state = State(agent) - return state - - def __init__(self, agent: Agent): - self.agent = agent - self.browser_session: Optional[browser_use.BrowserSession] = None - self.task: Optional[defer.DeferredTask] = None - self.use_agent: Optional[browser_use.Agent] = None - self.secrets_dict: Optional[dict[str, str]] = None - self.iter_no = 0 - - def __del__(self): - self.kill_task() - files.delete_dir(self.get_user_data_dir()) # cleanup user data dir - - def get_user_data_dir(self): - return str( - Path.home() - / ".config" - / "browseruse" - / "profiles" - / f"agent_{self.agent.context.id}" - ) - - async def _initialize(self): - if self.browser_session: - return - - # for some reason we need to provide exact path to headless shell, otherwise it looks for headed browser - pw_binary = ensure_playwright_binary() - - self.browser_session = browser_use.BrowserSession( - browser_profile=browser_use.BrowserProfile( - headless=True, - disable_security=True, - chromium_sandbox=False, - accept_downloads=True, - downloads_path=files.get_abs_path("tmp/downloads"), - allowed_domains=["*", "http://*", "https://*"], - executable_path=pw_binary, - keep_alive=True, - minimum_wait_page_load_time=1.0, - wait_for_network_idle_page_load_time=2.0, - maximum_wait_page_load_time=10.0, - window_size={"width": 1024, "height": 2048}, - screen={"width": 1024, "height": 2048}, - viewport={"width": 1024, "height": 2048}, - no_viewport=False, - args=["--headless=new"], - # Use a unique user data directory to 
avoid conflicts - user_data_dir=self.get_user_data_dir(), - extra_http_headers=self.agent.config.browser_http_headers or {}, - ) - ) - - await self.browser_session.start() if self.browser_session else None - # self.override_hooks() - - # -------------------------------------------------------------------------- - # Patch to enforce vertical viewport size - # -------------------------------------------------------------------------- - # Browser-use auto-configuration overrides viewport settings, causing wrong - # aspect ratio. We fix this by directly setting viewport size after startup. - # -------------------------------------------------------------------------- - - if self.browser_session: - try: - page = await self.browser_session.get_current_page() - if page: - await page.set_viewport_size({"width": 1024, "height": 2048}) - except Exception as e: - PrintStyle().warning(f"Could not force set viewport size: {e}") - - # -------------------------------------------------------------------------- - - # Add init script to the browser session - if self.browser_session and self.browser_session.browser_context: - js_override = files.get_abs_path("lib/browser/init_override.js") - await self.browser_session.browser_context.add_init_script(path=js_override) if self.browser_session else None - - def start_task(self, task: str): - if self.task and self.task.is_alive(): - self.kill_task() - - self.task = defer.DeferredTask( - thread_name="BrowserAgent" + self.agent.context.id - ) - if self.agent.context.task: - self.agent.context.task.add_child_task(self.task, terminate_thread=True) - self.task.start_task(self._run_task, task) if self.task else None - return self.task - - def kill_task(self): - if self.task: - self.task.kill(terminate_thread=True) - self.task = None - if self.browser_session: - try: - import asyncio - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - loop.run_until_complete(self.browser_session.close()) if self.browser_session else None - 
loop.close() - except Exception as e: - PrintStyle().error(f"Error closing browser session: {e}") - finally: - self.browser_session = None - self.use_agent = None - self.iter_no = 0 - - async def _run_task(self, task: str): - await self._initialize() - - class DoneResult(BaseModel): - title: str - response: str - page_summary: str - - # Initialize controller - controller = browser_use.Controller(output_model=DoneResult) - - # Register custom completion action with proper ActionResult fields - @controller.registry.action("Complete task", param_model=DoneResult) - async def complete_task(params: DoneResult): - result = browser_use.ActionResult( - is_done=True, success=True, extracted_content=params.model_dump_json() - ) - return result - - model = self.agent.get_browser_model() - - try: - - secrets_manager = get_secrets_manager(self.agent.context) - secrets_dict = secrets_manager.load_secrets() - - self.use_agent = browser_use.Agent( - task=task, - browser_session=self.browser_session, - llm=model, - use_vision=self.agent.config.browser_model.vision, - extend_system_message=self.agent.read_prompt( - "prompts/browser_agent.system.md" - ), - controller=controller, - enable_memory=False, # Disable memory to avoid state conflicts - llm_timeout=3000, # TODO rem - sensitive_data=cast(dict[str, str | dict[str, str]] | None, secrets_dict or {}), # Pass secrets - ) - except Exception as e: - raise Exception( - f"Browser agent initialization failed. This might be due to model compatibility issues. 
Error: {e}" - ) from e - - self.iter_no = get_iter_no(self.agent) - - async def hook(agent: browser_use.Agent): - await self.agent.wait_if_paused() - if self.iter_no != get_iter_no(self.agent): - raise InterventionException("Task cancelled") - - # try: - result = None - if self.use_agent: - result = await self.use_agent.run( - max_steps=50, on_step_start=hook, on_step_end=hook - ) - return result - - async def get_page(self): - if self.use_agent and self.browser_session: - try: - return await self.use_agent.browser_session.get_current_page() if self.use_agent.browser_session else None - except Exception: - # Browser session might be closed or invalid - return None - return None - - async def get_selector_map(self): - """Get the selector map for the current page state.""" - if self.use_agent: - await self.use_agent.browser_session.get_state_summary(cache_clickable_elements_hashes=True) if self.use_agent.browser_session else None - return await self.use_agent.browser_session.get_selector_map() if self.use_agent.browser_session else None - await self.use_agent.browser_session.get_state_summary( - cache_clickable_elements_hashes=True - ) - return await self.use_agent.browser_session.get_selector_map() - return {} - - -class BrowserAgent(Tool): - - async def execute(self, message="", reset="", **kwargs): - self.guid = self.agent.context.generate_id() # short random id - reset = str(reset).lower().strip() == "true" - await self.prepare_state(reset=reset) - message = get_secrets_manager(self.agent.context).mask_values(message, placeholder="{key}") # mask any potential passwords passed from A0 to browser-use to browser-use format - task = self.state.start_task(message) if self.state else None - - # wait for browser agent to finish and update progress with timeout - timeout_seconds = 300 # 5 minute timeout - start_time = time.time() - - fail_counter = 0 - while not task.is_ready() if task else False: - # Check for timeout to prevent infinite waiting - if time.time() - 
start_time > timeout_seconds: - PrintStyle().warning( - self._mask(f"Browser agent task timeout after {timeout_seconds} seconds, forcing completion") - ) - break - - await self.agent.handle_intervention() - await asyncio.sleep(1) - try: - if task and task.is_ready(): # otherwise get_update hangs - break - try: - update = await asyncio.wait_for(self.get_update(), timeout=10) - fail_counter = 0 # reset on success - except asyncio.TimeoutError: - fail_counter += 1 - PrintStyle().warning( - self._mask(f"browser_agent.get_update timed out ({fail_counter}/3)") - ) - if fail_counter >= 3: - PrintStyle().warning( - self._mask("3 consecutive browser_agent.get_update timeouts, breaking loop") - ) - break - continue - update_log = update.get("log", get_use_agent_log(None)) - self.update_progress("\n".join(update_log)) - screenshot = update.get("screenshot", None) - if screenshot: - self.log.update(screenshot=screenshot) - except Exception as e: - PrintStyle().error(self._mask(f"Error getting update: {str(e)}")) - - if task and not task.is_ready(): - PrintStyle().warning(self._mask("browser_agent.get_update timed out, killing the task")) - self.state.kill_task() if self.state else None - return Response( - message=self._mask("Browser agent task timed out, not output provided."), - break_loop=False, - ) - - # final progress update - if self.state and self.state.use_agent: - log_final = get_use_agent_log(self.state.use_agent) - self.update_progress("\n".join(log_final)) - - # collect result with error handling - try: - result = await task.result() if task else None - except Exception as e: - PrintStyle().error(self._mask(f"Error getting browser agent task result: {str(e)}")) - # Return a timeout response if task.result() fails - answer_text = self._mask(f"Browser agent task failed to return result: {str(e)}") - self.log.update(answer=answer_text) - return Response(message=answer_text, break_loop=False) - # finally: - # # Stop any further browser access after task completion - # 
# self.state.kill_task() - # pass - - # Check if task completed successfully - if result and result.is_done(): - answer = result.final_result() - try: - if answer and isinstance(answer, str) and answer.strip(): - answer_data = DirtyJson.parse_string(answer) - answer_text = strings.dict_to_text(answer_data) # type: ignore - else: - answer_text = ( - str(answer) if answer else "Task completed successfully" - ) - except Exception as e: - answer_text = ( - str(answer) - if answer - else f"Task completed with parse error: {str(e)}" - ) - else: - # Task hit max_steps without calling done() - urls = result.urls() if result else [] - current_url = urls[-1] if urls else "unknown" - answer_text = ( - f"Task reached step limit without completion. Last page: {current_url}. " - f"The browser agent may need clearer instructions on when to finish." - ) - - # Mask answer for logs and response - answer_text = self._mask(answer_text) - - # update the log (without screenshot path here, user can click) - self.log.update(answer=answer_text) - - # add screenshot to the answer if we have it - if ( - self.log.kvps - and "screenshot" in self.log.kvps - and self.log.kvps["screenshot"] - ): - path = self.log.kvps["screenshot"].split("//", 1)[-1].split("&", 1)[0] - answer_text += f"\n\nScreenshot: {path}" - - # respond (with screenshot path) - return Response(message=answer_text, break_loop=False) - - def get_log_object(self): - return self.agent.context.log.log( - type="browser", - heading=f"icon://captive_portal {self.agent.agent_name}: Calling Browser Agent", - content="", - kvps=self.args, - ) - - async def get_update(self): - await self.prepare_state() - - result = {} - agent = self.agent - ua = self.state.use_agent if self.state else None - page = await self.state.get_page() if self.state else None - - if ua and page: - try: - - async def _get_update(): - - # await agent.wait_if_paused() # no need here - - # Build short activity log - result["log"] = get_use_agent_log(ua) - - path = 
files.get_abs_path( - persist_chat.get_chat_folder_path(agent.context.id), - "browser", - "screenshots", - f"{self.guid}.png", - ) - files.make_dirs(path) - await page.screenshot(path=path, full_page=False, timeout=3000) - result["screenshot"] = f"img://{path}&t={str(time.time())}" - - if self.state and self.state.task and not self.state.task.is_ready(): - await self.state.task.execute_inside(_get_update) - - except Exception: - pass - - return result - - async def prepare_state(self, reset=False): - self.state = self.agent.get_data("_browser_agent_state") - if reset and self.state: - self.state.kill_task() - if not self.state or reset: - self.state = await State.create(self.agent) - self.agent.set_data("_browser_agent_state", self.state) - - def update_progress(self, text): - text = self._mask(text) - short = text.split("\n")[-1] - if len(short) > 50: - short = short[:50] + "..." - progress = f"Browser: {short}" - - self.log.update(progress=text) - self.agent.context.log.set_progress(progress) - - def _mask(self, text: str) -> str: - try: - return get_secrets_manager(self.agent.context).mask_values(text or "") - except Exception as e: - return text or "" - - # def __del__(self): - # if self.state: - # self.state.kill_task() - - -def get_use_agent_log(use_agent: browser_use.Agent | None): - result = ["🚦 Starting task"] - if use_agent: - action_results = use_agent.history.action_results() or [] - short_log = [] - for item in action_results: - # final results - if item.is_done: - if item.success: - short_log.append("✅ Done") - else: - short_log.append( - f"❌ Error: {item.error or item.extracted_content or 'Unknown error'}" - ) - - # progress messages - else: - text = item.extracted_content - if text: - first_line = text.split("\n", 1)[0][:200] - short_log.append(first_line) - result.extend(short_log) - return result diff --git a/python/tools/browser_control.py b/python/tools/browser_control.py new file mode 100644 index 0000000000..98e5d7cf9b --- /dev/null +++ 
b/python/tools/browser_control.py @@ -0,0 +1,534 @@ +""" +Browser Control Tool - Granular browser control with individual actions. + +This tool provides precise browser automation through individual action methods +(navigate, click, type, scroll, observe_page, etc.) following Agent Zero's +tool-based architecture. +""" + +import asyncio +import time +from typing import Optional +from dataclasses import dataclass +from agent import Agent, InterventionException +from pathlib import Path + +from python.helpers.tool import Tool, Response +from python.helpers import files, persist_chat +from python.helpers.print_style import PrintStyle +from python.helpers.playwright import ensure_playwright_binary +from python.helpers.browser_control_client import ( + PlaywrightClient, + BrowserControlState, + Action, + ActionType, + ActionResult, +) + + +class BrowserControl(Tool): + """ + Browser Control tool for granular browser control. + + Provides individual action methods for precise web automation. + """ + + async def execute(self, **kwargs) -> Response: + """ + Execute browser control action based on method name. + + Routes to specific methods like navigate, click, type, etc. 
+ """ + await self.agent.handle_intervention() + + # Generate unique GUID for screenshot naming + self.guid = self.agent.context.generate_id() + + method = self.method or "observe_page" + reset = str(kwargs.get("reset", "false")).lower() == "true" + + # Initialize/retrieve state + await self.prepare_state(reset=reset) + + # Route to specific method + result = None + try: + if method == "navigate": + result = await self._navigate(kwargs.get("url")) + elif method == "click": + result = await self._click(kwargs.get("selector")) + elif method == "type": + result = await self._type(kwargs.get("selector"), kwargs.get("text")) + elif method == "scroll": + result = await self._scroll(kwargs.get("direction", "down")) + elif method == "observe_page": + result = await self._observe_page() + elif method == "select": + result = await self._select(kwargs.get("selector"), kwargs.get("value")) + elif method == "press": + result = await self._press(kwargs.get("selector"), kwargs.get("key")) + elif method == "hover": + result = await self._hover(kwargs.get("selector")) + elif method == "pause_for_user": + result = await self._pause_for_user( + kwargs.get("wait_seconds", 60), + kwargs.get("message", "Pausing for user interaction...") + ) + elif method == "get_browser_info": + result = await self._get_browser_info() + else: + result = f"Unknown method: {method}. Available methods: navigate, click, type, scroll, observe_page, select, press, hover, pause_for_user, get_browser_info" + + # Capture screenshot after action (UI display) + await self._capture_screenshot(method) + + except Exception as e: + result = f"Error executing {method}: {str(e)}" + PrintStyle().error(result) + + if not result: + result = f"Method {method} completed but returned no output" + + return Response(message=result, break_loop=False) + + async def prepare_state(self, reset: bool = False): + """ + Initialize or retrieve Playwright state. + + Follows pattern from code_execution_tool state management. 
+ """ + self.state: Optional[BrowserControlState] = self.agent.get_data("_browser_control_state") + + if reset and self.state and self.state.client: + # Close existing session + try: + await self.state.client.close() + except Exception as e: + PrintStyle().warning(f"Error closing existing session: {e}") + self.state = None + + if not self.state or not self.state.initialized: + # Create new Playwright session + try: + from playwright.async_api import async_playwright + except ImportError: + raise ImportError( + "Playwright is not installed. Install with: pip install playwright" + ) + + # Get Playwright binary path (only needed if not using CDP) + cdp_url = self.agent.config.browser_control_cdp_url + pw_binary = None if cdp_url else ensure_playwright_binary() + + # Check if VNC is available and enabled + import os + use_vnc = os.path.exists('/tmp/vnc/status') and not cdp_url + vnc_display = os.environ.get('VNC_DISPLAY', ':99') if use_vnc else None + + # Create client + client = PlaywrightClient( + start_url=self.agent.config.browser_control_start_url, + headless=self.agent.config.browser_control_headless, + playwright_binary=str(pw_binary) if pw_binary else None, + cdp_url=cdp_url if cdp_url else None, + use_vnc=use_vnc, + vnc_display=vnc_display + ) + + # Initialize browser + await client.initialize() + + # Create state + self.state = BrowserControlState( + playwright=client.playwright, + browser=client.browser, + context=client.context, + page=client.page, + client=client, + initialized=True + ) + + self.agent.set_data("_browser_control_state", self.state) + + return self.state + + async def _get_state(self) -> Optional[BrowserControlState]: + """Helper to get current state.""" + return self.agent.get_data("_browser_control_state") + + async def _navigate(self, url: Optional[str]) -> str: + """Navigate to a URL with fallback handling.""" + if not url: + return "Error: URL is required for navigate action" + + state = await self._get_state() + if not state or not 
state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.NAVIGATE, value=url) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Navigation failed: {result.error}" + + async def _click(self, selector: Optional[str]) -> str: + """Click element with selector strategies and text fallback.""" + if not selector: + return "Error: Selector is required for click action" + + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.CLICK, selector=selector) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Click failed: {result.error}. Try a different selector or text content." + + async def _type(self, selector: Optional[str], text: Optional[str]) -> str: + """Type text into input field.""" + if not selector: + return "Error: Selector is required for type action" + if not text: + return "Error: Text is required for type action" + + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.TYPE, selector=selector, value=text) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Type failed: {result.error}" + + async def _scroll(self, direction: str = "down") -> str: + """Scroll page up/down/left/right.""" + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.SCROLL, value=direction) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Scroll failed: {result.error}" + + async def _select(self, selector: Optional[str], value: Optional[str]) -> str: + """Select option 
from dropdown.""" + if not selector: + return "Error: Selector is required for select action" + if not value: + return "Error: Value is required for select action" + + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.SELECT, selector=selector, value=value) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Select failed: {result.error}" + + async def _press(self, selector: Optional[str], key: Optional[str]) -> str: + """Press keyboard key on element.""" + if not selector: + return "Error: Selector is required for press action" + if not key: + return "Error: Key is required for press action" + + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.PRESS, selector=selector, value=key) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Press failed: {result.error}" + + async def _hover(self, selector: Optional[str]) -> str: + """Hover over element.""" + if not selector: + return "Error: Selector is required for hover action" + + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + action = Action(action_type=ActionType.HOVER, selector=selector) + result = await state.client.execute_action(action) + + if result.success: + return result.description + else: + return f"Hover failed: {result.error}" + + async def _observe_page(self) -> str: + """ + Extract page content and add screenshot to LLM context. + + This method provides semantic content extraction and adds + screenshot to agent history for vision model analysis. 
+ """ + state = await self._get_state() + if not state or not state.client or not state.client.page: + return "Error: Browser not initialized" + + page = state.client.page + + # Build description of the page + try: + description = f"URL: {page.url}\n" + description += f"Title: {await page.title()}\n\n" + + # Extract semantic content (headings, articles, main content) + content_data = await page.evaluate( + """ + () => { + // Extract headings + const headings = Array.from(document.querySelectorAll('h1, h2, h3')) + .slice(0, 10) + .map(h => `${h.tagName}: ${h.innerText.trim()}`) + .filter(h => h.length > 5); + + // Extract article content or main content + let mainText = ''; + const article = document.querySelector('article, main, [role="main"]'); + if (article) { + mainText = article.innerText.substring(0, 5000); + } else { + mainText = document.body.innerText.substring(0, 5000); + } + + return { + headings: headings, + text: mainText + }; + } + """ + ) + + # Format the content + if content_data.get("headings"): + description += "Key headings:\n" + for heading in content_data["headings"][:8]: + description += f" {heading}\n" + description += "\n" + + if content_data.get("text"): + description += f"Page content:\n{content_data['text']}\n\n" + + # Get interactive elements + interface_state = await state.client.get_state("text") + if interface_state.interactive_elements: + description += f"Interactive elements: {len(interface_state.interactive_elements)} found\n" + description += "Key elements:\n" + # Filter for meaningful elements + meaningful_elements = [ + elem + for elem in interface_state.interactive_elements[:15] + if elem.get("text", "").strip() + and len(elem.get("text", "").strip()) > 2 + ] + for elem in meaningful_elements[:10]: + text = elem.get("text", "").strip()[:50] + tag = elem.get("tag", "") + if text: + description += f" - {tag}: {text}\n" + + # Add screenshot to agent history for vision model analysis + if self.agent.config.chat_model.vision: + try: 
+ screenshot_b64 = await state.client.get_screenshot_base64() + # Add to history as multimodal content + self.agent.hist_add_message( + False, # Not user message + content={ + "role": "user", + "type": "image", + "image": screenshot_b64, + "description": f"Screenshot of current page: {page.url}" + } + ) + except Exception as e: + PrintStyle().warning(f"Could not add screenshot to context: {e}") + + return description + + except Exception as e: + return f"Error extracting page content: {str(e)}" + + async def _pause_for_user(self, wait_seconds: int = 60, message: str = "Pausing for user interaction...") -> str: + """ + Pause execution to allow user to manually interact with the browser. + + This is useful for: + - Solving CAPTCHAs + - Manual login when automation is blocked + - Accepting cookies/terms manually + - Any other manual intervention needed + + Args: + wait_seconds: How long to wait for user interaction (default 60 seconds) + message: Custom message to display to user + + Note: If VNC is enabled, a URL will be provided for browser access. + """ + state = await self._get_state() + if not state or not state.client: + return "Error: Browser not initialized" + + # Check for VNC URL first + vnc_url = state.client.get_vnc_url(host="localhost", port=56080) + + # Build initial message with VNC URL + initial_message = f"⏸️ **Browser Pause Requested**\n\n{message}\n\n" + + if vnc_url: + initial_message += f"🌐 **Control Browser**: {vnc_url}\n\n" + initial_message += "Click the link above to access the browser and complete the manual task.\n" + initial_message += f"⏱️ Waiting up to {wait_seconds} seconds for you to complete the task...\n\n" + initial_message += "The browser control panel should open automatically in the web interface." + else: + initial_message += "⚠️ VNC is not available. Browser should be visible on your display.\n" + initial_message += f"⏱️ Waiting up to {wait_seconds} seconds..." 
+ + # Update log with initial message so frontend can show browser panel + self.log.update(message=initial_message) + + # Call the client to mark the pause (returns immediately now) + action = Action( + action_type=ActionType.PAUSE_FOR_USER, + value=str(wait_seconds), + metadata={"message": message} + ) + result = await state.client.execute_action(action) + + if not result.success: + return f"Pause failed: {result.error}" + + # Now actually pause/wait at the Agent level + import asyncio + PrintStyle().info(f"Browser paused for user interaction. Waiting {wait_seconds} seconds...") + + try: + await asyncio.sleep(wait_seconds) + completion_message = f"✅ Browser pause completed. Resuming agent execution.\n\nCurrent page: {state.client.page.url}" + except asyncio.CancelledError: + completion_message = "Browser pause interrupted. Resuming agent execution." + + return completion_message + + async def _get_browser_info(self) -> str: + """ + Get diagnostic information about the browser configuration. + + Returns current browser state including visibility mode, + configuration settings, and helpful troubleshooting info. 
+ """ + state = await self._get_state() + + info = [] + info.append("=== Browser Configuration ===") + info.append(f"Config headless mode: {self.agent.config.browser_control_headless}") + info.append(f"Config start URL: {self.agent.config.browser_control_start_url}") + info.append(f"Config timeout: {self.agent.config.browser_control_timeout}ms") + info.append("") + + if state and state.client: + info.append("=== Browser State ===") + info.append(f"Browser initialized: Yes") + info.append(f"Browser headless mode: {state.client.headless}") + if state.client.page: + info.append(f"Current URL: {state.client.page.url}") + info.append(f"Page title: {await state.client.page.title()}") + info.append("") + + # Provide helpful tips + info.append("=== Visibility Status ===") + if state.client.headless: + info.append("⚠️ Browser is running in HEADLESS mode (invisible)") + info.append("") + info.append("To see the browser window:") + info.append("1. Close current browser session with reset=true") + info.append("2. Set browser_control_headless=False in agent.py") + info.append("3. Restart the agent") + info.append("4. Or use: browser_control:navigate with reset='true'") + else: + info.append("✓ Browser is running in VISIBLE mode") + info.append(" A browser window should be visible on your screen") + info.append("") + info.append("If you don't see the window:") + info.append("- Check if it opened on another desktop/display") + info.append("- Look for Chrome in your taskbar/dock") + info.append("- Try alt-tabbing (Windows) or Command-Tab (Mac)") + info.append("- The window may be minimized or behind other windows") + else: + info.append("=== Browser State ===") + info.append("Browser not initialized yet") + info.append("First use of browser_control will initialize the browser") + + return "\n".join(info) + + async def _capture_screenshot(self, method: str): + """ + Capture screenshot after action for UI display. 
+ + """ + try: + state = await self._get_state() + if not state or not state.client or not state.client.page: + return + + # Create screenshot directory + screenshot_path = files.get_abs_path( + persist_chat.get_chat_folder_path(self.agent.context.id), + "browser_control", + "screenshots", + f"{self.guid}.png" + ) + files.make_dirs(screenshot_path) + + # Save screenshot to file (viewport only, not full page) + await state.client.page.screenshot( + path=screenshot_path, + full_page=False, + timeout=self.agent.config.browser_control_timeout + ) + + # Update log with img:// protocol for UI display + screenshot_url = f"img://{screenshot_path}&t={str(time.time())}" + self.log.update(screenshot=screenshot_url) + + except Exception as e: + # Don't fail the tool execution if screenshot capture fails + PrintStyle().warning(f"Could not capture screenshot: {e}") + + def get_log_object(self): + """Override logging method to provide custom heading.""" + if self.method: + heading = f"icon://web {self.agent.agent_name}: Using browser_control:{self.method}" + else: + heading = f"icon://web {self.agent.agent_name}: Using browser_control" + return self.agent.context.log.log( + type="tool", + heading=heading, + content="", + kvps=self.args + ) + diff --git a/requirements.txt b/requirements.txt index f0391d266a..a9495ab8da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ a2wsgi==1.10.8 ansio==0.0.1 -browser-use==0.5.11 docker==7.1.0 duckduckgo-search==6.1.12 faiss-cpu==1.11.0 @@ -19,7 +18,7 @@ langchain-unstructured[all-docs]==0.1.6 openai-whisper==20240930 lxml_html_clean==0.3.1 markdown==3.7 -mcp==1.13.1 +mcp>=1.13.1 newspaper3k==0.2.8 paramiko==3.5.0 playwright==1.52.0 diff --git a/webui/components/browser-control/browser-control-icons.html b/webui/components/browser-control/browser-control-icons.html new file mode 100644 index 0000000000..5069a902c2 --- /dev/null +++ b/webui/components/browser-control/browser-control-icons.html @@ -0,0 +1,44 @@ + + + Browser 
Control Toggle Icon + + + +
+ +
+ + + + diff --git a/webui/components/browser-control/browser-control-store.js b/webui/components/browser-control/browser-control-store.js new file mode 100644 index 0000000000..0c80107a9f --- /dev/null +++ b/webui/components/browser-control/browser-control-store.js @@ -0,0 +1,65 @@ +import { createStore } from "/js/AlpineStore.js"; + +const model = { + isVisible: false, + isMinimized: false, + isMaximized: false, + vncUrl: '', + vncReady: false, + _checkInterval: null, + + init() { + this.checkVncAvailability(); + // Poll for VNC availability every 3 seconds + this._checkInterval = setInterval(() => this.checkVncAvailability(), 3000); + }, + + async checkVncAvailability() { + try { + const response = await fetch('/browser_control?action=info'); + const data = await response.json(); + this.vncReady = data.vnc_ready; + + if (data.vnc_ready && data.novnc_url) { + this.vncUrl = data.novnc_url; + } + } catch (error) { + console.log('VNC not available:', error); + this.vncReady = false; + } + }, + + show(url = null) { + if (url) { + this.vncUrl = url; + } + this.isVisible = true; + this.isMinimized = false; + }, + + hide() { + this.isVisible = false; + }, + + cleanup() { + if (this._checkInterval) { + clearInterval(this._checkInterval); + this._checkInterval = null; + } + }, + + toggleMinimize() { + this.isMinimized = !this.isMinimized; + if (this.isMinimized) { + this.isMaximized = false; + } + }, + + toggleMaximize() { + this.isMaximized = !this.isMaximized; + } +}; + +// Create and export the store +const store = createStore("browserControl", model); +export { store }; diff --git a/webui/components/browser-control/browser-panel.html b/webui/components/browser-control/browser-panel.html new file mode 100644 index 0000000000..0d05e30fd6 --- /dev/null +++ b/webui/components/browser-control/browser-panel.html @@ -0,0 +1,373 @@ + + + Browser Control Panel + + + +
+ +
+ + + + + + + + diff --git a/webui/components/chat/top-section/chat-top-store.js b/webui/components/chat/top-section/chat-top-store.js index 8dd47b000d..7e0752c8eb 100644 --- a/webui/components/chat/top-section/chat-top-store.js +++ b/webui/components/chat/top-section/chat-top-store.js @@ -2,7 +2,8 @@ import { createStore } from "/js/AlpineStore.js"; // define the model object holding data and functions const model = { - connected: false, + connected: false, // Shows whether agent is actively processing (green when true) + backendAlive: true, // Tracks backend connection health }; // convert it to alpine store diff --git a/webui/components/chat/top-section/chat-top.html b/webui/components/chat/top-section/chat-top.html index 179495d6cc..e766146533 100644 --- a/webui/components/chat/top-section/chat-top.html +++ b/webui/components/chat/top-section/chat-top.html @@ -20,17 +20,19 @@
- + - - + +
+ + diff --git a/webui/index.html b/webui/index.html index 7b62b4f2bf..d2252f3a49 100644 --- a/webui/index.html +++ b/webui/index.html @@ -99,6 +99,9 @@ + + +