From 070225bcf12506710679898ef12358bf5b6ccdea Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 27 Jan 2026 17:01:35 -0500 Subject: [PATCH] feat(waa): refactor CLI and fix Python 3.9 compatibility - Refactor CLI from 6800 to ~1300 lines with flat command structure - Add analyze command to parse and summarize benchmark results - Add --num-tasks flag to limit number of tasks to run - Fix Python 3.9 compatibility by copying Python from vanilla WAA image (fixes transformers 4.46.2 compatibility with GroundingDINO) - Add coverage and analysis artifacts to .gitignore Co-Authored-By: Claude Opus 4.5 --- .gitignore | 10 + openadapt_ml/benchmarks/cli.py | 7823 +++-------------- openadapt_ml/benchmarks/waa_deploy/Dockerfile | 65 +- 3 files changed, 1465 insertions(+), 6433 deletions(-) diff --git a/.gitignore b/.gitignore index a613b58..1f4a400 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,16 @@ demos/ # Pytest cache .pytest_cache/ +# Coverage files +.coverage +.coverage.* +htmlcov/ + +# Analysis/debug artifacts +cli_analysis.json +dead_code_candidates.json +segmentation_output/ + # Internal documentation (not for public repo) docs/internal/ docs/private/ diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index daea490..d16f231 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -1,6733 +1,1760 @@ -"""CLI for WAA benchmark evaluation. - -Usage: - # ============================================ - # WAA (Vanilla + Automated) - # ============================================ - - # Check for WAA repo + setup.iso + config.json - ./scripts/waa_bootstrap_helper.sh --clone - - # Prepare Windows 11 golden image (one-time, ~20 min) - ./scripts/waa_bootstrap_local.sh --iso-path /path/to/Windows11_Enterprise_Eval.iso - - # Run vanilla WAA benchmarks - cd /path/to/WindowsAgentArena/scripts - ./run-local.sh - - # Check VM status - python -m openadapt_ml.benchmarks.cli vm status - - # SSH into VM for manual control - python -m openadapt_ml.benchmarks.cli vm ssh - - # Clean up when done - python -m openadapt_ml.benchmarks.cli vm delete - - # ============================================ - # Benchmark Viewer (for monitoring running benchmarks) - # ============================================ - - # Launch viewer for an already-running VM - python -m openadapt_ml.benchmarks.cli viewer --vm-ip 172.171.112.41 - - # Launch on specific port without auto-opening browser - python -m openadapt_ml.benchmarks.cli viewer --vm-ip 172.171.112.41 --port 9000 --no-open - - # ============================================ - # Analyze Results - # ============================================ - - # Analyze results on remote VM (fast, no download) - python -m openadapt_ml.benchmarks.cli analyze --vm-ip --remote - - # Analyze with verbose output (shows task IDs) - python -m openadapt_ml.benchmarks.cli analyze --vm-ip --remote --verbose - - # Save analysis to JSON - python -m openadapt_ml.benchmarks.cli analyze --vm-ip --remote --output results.json - - # Analyze local results directory - python -m openadapt_ml.benchmarks.cli analyze --results-dir /path/to/results - - # ============================================ - # Mock/Testing (no Windows required) - # ============================================ - - # Test with mock adapter - python -m openadapt_ml.benchmarks.cli test-mock --tasks 20 - - # Test data collection (with screenshots and execution traces) - python -m openadapt_ml.benchmarks.cli test-collection --tasks 5 - - # ============================================ - 
# API-backed evaluation (Claude/GPT baselines) - # ============================================ +#!/usr/bin/env python3 +""" +WAA Benchmark CLI - Windows Agent Arena evaluation toolkit - python -m openadapt_ml.benchmarks.cli run-api --provider anthropic --tasks 5 - python -m openadapt_ml.benchmarks.cli run-api --provider openai --tasks 5 +Uses custom waa_deploy/Dockerfile with dockurr/windows:latest base and +Python 3.9 from vanilla windowsarena/winarena for GroundingDINO compatibility. - # ============================================ - # Azure ML (Note: doesn't support nested virt) - # ============================================ +See waa_deploy/Dockerfile for details. - python -m openadapt_ml.benchmarks.cli estimate --workers 40 - python -m openadapt_ml.benchmarks.cli run-azure --config azure_config.json --workers 40 +Usage: + uv run python -m openadapt_ml.benchmarks.cli [options] + +Commands: + create Create Azure VM with nested virtualization + delete Delete VM and ALL associated resources + status Show VM state and IP + build Build WAA image from waa_deploy/Dockerfile + start Start WAA container (Windows boots + WAA server) + probe Check if WAA server is ready + run Run benchmark tasks + deallocate Stop VM (preserves disk, stops billing) + logs Show WAA status and logs + +Workflow: + 1. create - Create Azure VM (~5 min) + 2. build - Build custom WAA image (~10 min) + 3. start - Start container, Windows downloads+boots (~15-20 min first time) + 4. probe --wait - Wait for WAA server + 5. run - Run benchmark + 6. deallocate - Stop billing """ -from __future__ import annotations - import argparse -import os import json -import logging +import subprocess import sys import time -import warnings +from datetime import datetime from pathlib import Path - -from openadapt_ml.config import settings - -logger = logging.getLogger(__name__) - -# Pre-configure loggers to be quiet by default (before any Azure imports) -logging.getLogger("azure").setLevel(logging.WARNING) -logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( - logging.WARNING -) -logging.getLogger("azure.ai.ml").setLevel(logging.WARNING) -logging.getLogger("urllib3").setLevel(logging.WARNING) -logging.getLogger("msrest").setLevel(logging.WARNING) -logging.getLogger("openadapt_ml.benchmarks.azure").setLevel(logging.WARNING) - -# Suppress Azure SDK experimental class warnings -warnings.filterwarnings("ignore", message=".*experimental class.*") - -# SSH options to handle host key changes when VMs are recreated -# StrictHostKeyChecking=no: Accept new host keys automatically -# UserKnownHostsFile=/dev/null: Don't save/check known_hosts (avoids conflicts) -# ServerAliveInterval=60: Send keepalive every 60 seconds to prevent timeout -# ServerAliveCountMax=10: Disconnect after 10 missed keepalives (10 min tolerance) -# TCPKeepAlive=yes: Enable TCP-level keepalive as additional safeguard -# ConnectTimeout=15: Fail fast on connection issues (default is system TCP timeout ~2min) +from typing import Optional + +# ============================================================================= +# Constants (single source of truth) +# ============================================================================= + +VM_SIZE = "Standard_D4ds_v4" +VM_REGIONS = ["centralus", "eastus", "westus2", "eastus2"] +VM_NAME = "waa-eval-vm" +RESOURCE_GROUP = "openadapt-agents" +# Custom image built from waa_deploy/Dockerfile +# Uses dockurr/windows:latest (proper ISO download) + WAA components +DOCKER_IMAGE = "waa-auto:latest" +LOG_DIR = 
Path.home() / ".openadapt" / "waa" SSH_OPTS = [ "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", "-o", - "ServerAliveInterval=60", + "LogLevel=ERROR", "-o", - "ServerAliveCountMax=10", - "-o", - "TCPKeepAlive=yes", - "-o", - "ConnectTimeout=15", + "ConnectTimeout=10", ] +# Dockerfile location (relative to this file) +DOCKERFILE_PATH = Path(__file__).parent / "waa_deploy" / "Dockerfile" -def ssh_cmd(ip: str, cmd: str, extra_opts: list[str] | None = None) -> list[str]: - """Build SSH command with proper options for Azure VMs. - - Args: - ip: IP address of the VM - cmd: Command to run on the VM - extra_opts: Additional SSH options (e.g., ["-o", "ConnectTimeout=10"]) +# ============================================================================= +# Logging +# ============================================================================= - Returns: - Complete SSH command as a list for subprocess - """ - base = ["ssh", *SSH_OPTS] - if extra_opts: - base.extend(extra_opts) - base.append(f"azureuser@{ip}") - base.append(cmd) - return base +_log_file: Optional[Path] = None +_session_id: Optional[str] = None -def scp_cmd(src: str, dest: str, recursive: bool = False) -> list[str]: - """Build SCP command with proper options for Azure VMs. +def init_logging() -> Path: + """Initialize logging for this session.""" + global _log_file, _session_id - Args: - src: Source path (local or remote user@host:path) - dest: Destination path (local or remote user@host:path) - recursive: Whether to copy directories recursively + LOG_DIR.mkdir(parents=True, exist_ok=True) - Returns: - Complete SCP command as a list for subprocess - """ - base = ["scp", *SSH_OPTS] - if recursive: - base.append("-r") - base.extend([src, dest]) - return base + # Create session ID + _session_id = datetime.now().strftime("%Y-%m-%d_%H%M%S") + session_dir = LOG_DIR / "sessions" / _session_id + session_dir.mkdir(parents=True, exist_ok=True) + # Session log file + _log_file = session_dir / "full.log" -def check_vm_running(resource_group: str, vm_name: str) -> tuple[bool, str]: - """Check if an Azure VM is in running state. + # Update current session pointer + (LOG_DIR / "session_id.txt").write_text(_session_id) - Args: - resource_group: Azure resource group name - vm_name: Name of the VM + # Symlink for easy access + current_link = LOG_DIR / "current" + if current_link.exists() or current_link.is_symlink(): + current_link.unlink() + current_link.symlink_to(session_dir) - Returns: - Tuple of (is_running, power_state) - """ - import subprocess + return _log_file - result = subprocess.run( - [ - "az", "vm", "show", "-d", - "-g", resource_group, - "-n", vm_name, - "--query", "powerState", - "-o", "tsv", - ], - capture_output=True, - text=True, - timeout=30, - ) - if result.returncode != 0: - return False, "not_found" - power_state = result.stdout.strip() - return "running" in power_state.lower(), power_state +def log(step: str, message: str, end: str = "\n"): + """Log message to file and stdout.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + formatted = f"[{timestamp}] [{step}] {message}" -def run_ssh_with_retry( - ip: str, - cmd: str, - max_retries: int = 3, - initial_delay: float = 2.0, - verbose: bool = False, -) -> subprocess.CompletedProcess: - """Run SSH command with retry logic and exponential backoff. 
+ # Print to stdout + print(formatted, end=end, flush=True) - Args: - ip: IP address of the VM - cmd: Command to run on the VM - max_retries: Maximum number of retry attempts (default 3) - initial_delay: Initial delay between retries in seconds (default 2.0) - verbose: If True, print retry messages + # Write to log file + if _log_file: + with open(_log_file, "a") as f: + f.write(formatted + end) - Returns: - subprocess.CompletedProcess from the successful attempt - Raises: - subprocess.SubprocessError: If all retries fail - """ - import subprocess - import time +def log_stream(step: str, process: subprocess.Popen): + """Stream process output to log and stdout.""" + if process.stdout: + for line in iter(process.stdout.readline, ""): + if line: + log(step, line.rstrip()) - last_error = None - for attempt in range(max_retries + 1): - try: - result = subprocess.run( - ssh_cmd(ip, cmd), - capture_output=True, - text=True, - timeout=60, - ) - # SSH succeeded (even if remote command failed) - return result - except subprocess.TimeoutExpired as e: - last_error = e - if verbose: - print(f" SSH timeout (attempt {attempt + 1}/{max_retries + 1})") - except Exception as e: - last_error = e - if verbose: - print(f" SSH error (attempt {attempt + 1}/{max_retries + 1}): {e}") - - # Don't sleep after last attempt - if attempt < max_retries: - delay = initial_delay * (2 ** attempt) # Exponential backoff - if verbose: - print(f" Retrying in {delay:.1f}s...") - time.sleep(delay) - - # All retries exhausted - raise subprocess.SubprocessError( - f"SSH to {ip} failed after {max_retries + 1} attempts: {last_error}" - ) +# ============================================================================= +# Azure Helpers +# ============================================================================= -def setup_logging(verbose: bool = False) -> None: - """Configure logging with appropriate verbosity. - Args: - verbose: If True, show all logs. If False, suppress Azure SDK noise. - """ - level = logging.DEBUG if verbose else logging.INFO - logging.basicConfig( - level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +def get_vm_ip() -> Optional[str]: + """Get VM public IP if it exists.""" + result = subprocess.run( + [ + "az", + "vm", + "show", + "-d", + "-g", + RESOURCE_GROUP, + "-n", + VM_NAME, + "--query", + "publicIps", + "-o", + "tsv", + ], + capture_output=True, + text=True, ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + return None - # Suppress noisy Azure SDK logs unless verbose - if not verbose: - logging.getLogger("azure").setLevel(logging.WARNING) - logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( - logging.WARNING - ) - logging.getLogger("urllib3").setLevel(logging.WARNING) - logging.getLogger("msrest").setLevel(logging.WARNING) +def get_vm_state() -> Optional[str]: + """Get VM power state.""" + result = subprocess.run( + [ + "az", + "vm", + "get-instance-view", + "-g", + RESOURCE_GROUP, + "-n", + VM_NAME, + "--query", + "instanceView.statuses[1].displayStatus", + "-o", + "tsv", + ], + capture_output=True, + text=True, + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + return None -def bypass_product_key_dialog(ip: str, max_attempts: int = 3) -> bool: - """Send keyboard commands via QEMU monitor to skip the product key dialog. - Windows 11 Evaluation ISOs require clicking "I don't have a product key". - This function sends Tab + Enter keys via QEMU monitor to click that link. 
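+# Example usage of ssh_run (illustrative; the commands shown are arbitrary):
+#   ssh_run(ip, "docker ps")                                   # captured output
+#   ssh_run(ip, "docker pull ubuntu:22.04", stream=True, step="PULL")  # streamed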
+def ssh_run(
+    ip: str, cmd: str, stream: bool = False, step: str = "SSH"
+) -> subprocess.CompletedProcess:
+    """Run command on VM via SSH.

-    Args:
-        ip: IP address of the Azure VM running the container.
-        max_attempts: Number of times to try clicking (in case of timing issues).
+    When stream=True:
+    1. Runs command on VM with output redirected to a persistent log file
+    2. Streams that log file locally in real-time
+    3. Log file persists on VM even if connection breaks

-    Returns:
-        True if commands were sent successfully.
+    Remote logs are stored at: /home/azureuser/cli_logs/<step>_<timestamp>.log
     """
-    import subprocess
-    import time
+    if stream:
+        # Remote log directory and file (persistent across sessions)
+        remote_log_dir = "/home/azureuser/cli_logs"
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        remote_log = f"{remote_log_dir}/{step.lower()}_{timestamp}.log"
+
+        # Ensure log directory exists
+        subprocess.run(
+            ["ssh", *SSH_OPTS, f"azureuser@{ip}", f"mkdir -p {remote_log_dir}"],
+            capture_output=True,
+        )

-    # QEMU sendkey commands to navigate to "I don't have a product key" link
-    # The link is at the bottom of the dialog - Tab navigates through UI elements
-    # We need to Tab to the link and press Enter
+        log(step, f"Remote log: {remote_log}")

-    for attempt in range(max_attempts):
-        try:
-            # Send commands via QEMU monitor (port 7100 in container)
-            ssh_cmd = """
-# Use telnet to send QEMU commands
-(
-echo "sendkey tab"
-sleep 0.3
-echo "sendkey tab"
-sleep 0.3
-echo "sendkey tab"
-sleep 0.3
-echo "sendkey tab"
-sleep 0.3
-echo "sendkey ret"
-sleep 0.5
-) | timeout 10 docker exec -i winarena nc localhost 7100 2>/dev/null
+        # Run command with output piped through tee, capturing the exit code:
+        # tee writes to the persistent log file AND to stdout, so the command
+        # runs in the foreground while a copy of its output (including \r
+        # progress updates) accumulates on the VM
+        wrapped_cmd = f"""
+set -o pipefail
+{{
+    {cmd}
+    echo $? 
> {remote_log}.exit +}} 2>&1 | tee {remote_log} """ - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - "-o", - "ConnectTimeout=10", - f"azureuser@{ip}", - ssh_cmd, - ], - capture_output=True, - text=True, - timeout=30, - ) + full_cmd = ["ssh", *SSH_OPTS, f"azureuser@{ip}", wrapped_cmd] - if "QEMU" in result.stdout or result.returncode == 0: - return True - - time.sleep(2) + process = subprocess.Popen( + full_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) - except (subprocess.TimeoutExpired, Exception) as e: - logger.debug(f"Attempt {attempt + 1} failed: {e}") - time.sleep(2) + # Stream output to local log + try: + for line in iter(process.stdout.readline, ""): + if line: + # Handle carriage returns (Docker progress) + clean_line = line.rstrip() + if "\r" in clean_line: + # Take the last part after \r + parts = clean_line.split("\r") + clean_line = parts[-1].strip() + if clean_line: + log(step, clean_line) + process.wait() + except KeyboardInterrupt: + log(step, "Interrupted - command continues on VM") + log(step, f"View full log: ssh azureuser@{ip} 'cat {remote_log}'") + process.terminate() + return subprocess.CompletedProcess(cmd, 130, "", "") - return False + # Get exit code + result = subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + f"cat {remote_log}.exit 2>/dev/null || echo 1", + ], + capture_output=True, + text=True, + ) + exit_code = int(result.stdout.strip()) if result.stdout.strip().isdigit() else 1 + if exit_code != 0: + log(step, f"Command failed (exit {exit_code})") + log(step, f"Full log: ssh azureuser@{ip} 'cat {remote_log}'") -def find_waa_path() -> Path | None: - """Auto-detect Windows Agent Arena repository path. + return subprocess.CompletedProcess(cmd, exit_code, "", "") + else: + full_cmd = ["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd] + return subprocess.run(full_cmd, capture_output=True, text=True) - Searches in order: - 1. vendor/WindowsAgentArena (git submodule) - 2. ../WindowsAgentArena (sibling directory) - 3. ~/WindowsAgentArena (home directory) - Returns: - Path to WAA repo, or None if not found. - """ - # Get the project root (where this package is installed) - project_root = Path(__file__).parent.parent.parent +def wait_for_ssh(ip: str, timeout: int = 120) -> bool: + """Wait for SSH to become available.""" + start = time.time() + while time.time() - start < timeout: + result = subprocess.run( + ["ssh", *SSH_OPTS, f"azureuser@{ip}", "echo ok"], + capture_output=True, + text=True, + timeout=15, + ) + if result.returncode == 0: + return True + time.sleep(5) + return False - candidates = [ - project_root / "vendor" / "WindowsAgentArena", - project_root.parent / "WindowsAgentArena", - Path.home() / "WindowsAgentArena", - ] - for path in candidates: - if path.exists() and (path / "src").exists(): - return path +# ============================================================================= +# Commands +# ============================================================================= - return None +def cmd_create(args): + """Create Azure VM with nested virtualization.""" + init_logging() + log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE})...") -def get_waa_path(args_path: str | None) -> Path: - """Get WAA path from args or auto-detect. + # Check if VM already exists + ip = get_vm_ip() + if ip: + log("CREATE", f"VM already exists: {ip}") + log("CREATE", "Use 'delete' first if you want to recreate") + return 0 - Args: - args_path: Path from command line args, or None. 
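+    # Capacity for Standard_D4ds_v4 (needed for nested virtualization) varies
+    # by region and subscription, so each entry in VM_REGIONS is tried in
+    # order until "az vm create" succeeds.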
+ # Try regions until one works + vm_created = False + for region in VM_REGIONS: + log("CREATE", f"Trying {region}...", end=" ") - Returns: - Resolved WAA path. + result = subprocess.run( + [ + "az", + "vm", + "create", + "--resource-group", + RESOURCE_GROUP, + "--name", + VM_NAME, + "--location", + region, + "--image", + "Ubuntu2204", + "--size", + VM_SIZE, + "--admin-username", + "azureuser", + "--generate-ssh-keys", + "--public-ip-sku", + "Standard", + ], + capture_output=True, + text=True, + ) - Raises: - SystemExit: If WAA cannot be found. - """ - if args_path: - path = Path(args_path) - if not path.exists(): - print(f"ERROR: WAA path does not exist: {path}") - sys.exit(1) - return path - - path = find_waa_path() - if path: - print(f" Using WAA from: {path}") - return path - - print("ERROR: Windows Agent Arena not found!") - print("\nTo fix, run:") - print(" git submodule update --init --recursive") - print("\nOr specify path manually:") - print(" --waa-path /path/to/WindowsAgentArena") - sys.exit(1) - - -def cmd_estimate(args: argparse.Namespace) -> None: - """Estimate Azure costs.""" - from openadapt_ml.benchmarks.azure import estimate_cost - - estimate = estimate_cost( - num_tasks=args.tasks, - num_workers=args.workers, - avg_task_duration_minutes=args.duration, - vm_hourly_cost=args.vm_cost, - ) + if result.returncode == 0: + vm_info = json.loads(result.stdout) + ip = vm_info.get("publicIpAddress", "") + log("CREATE", f"created ({ip})") + vm_created = True + break + else: + log("CREATE", "unavailable") + + if not vm_created: + log("CREATE", "ERROR: Could not create VM in any region") + return 1 + + # Wait for SSH + log("CREATE", "Waiting for SSH...") + if not wait_for_ssh(ip): + log("CREATE", "ERROR: SSH not available after 2 minutes") + return 1 + log("CREATE", "SSH ready") + + # Install Docker with /mnt storage + log("CREATE", "Installing Docker with /mnt storage...") + docker_setup = """ +set -e +sudo apt-get update -qq +sudo apt-get install -y -qq docker.io +sudo systemctl start docker +sudo systemctl enable docker +sudo usermod -aG docker $USER + +# Configure Docker to use /mnt (larger temp disk) +sudo systemctl stop docker +sudo mkdir -p /mnt/docker +sudo bash -c 'echo "{\\"data-root\\": \\"/mnt/docker\\"}" > /etc/docker/daemon.json' +sudo systemctl start docker - print("\n=== WAA Azure Cost Estimate ===") - print(f"Tasks: {estimate['num_tasks']}") - print(f"Workers: {estimate['num_workers']}") - print(f"Tasks per worker: {estimate['tasks_per_worker']:.1f}") - print( - f"Estimated duration: {estimate['estimated_duration_minutes']:.1f} minutes" - ) - print(f"Total VM hours: {estimate['total_vm_hours']:.2f}") - print(f"Estimated cost: ${estimate['estimated_cost_usd']:.2f}") - print(f"Cost per task: ${estimate['cost_per_task_usd']:.4f}") - print() +# Verify +docker --version +df -h /mnt +""" + result = ssh_run(ip, docker_setup, stream=True, step="CREATE") + if result.returncode != 0: + log("CREATE", "ERROR: Docker setup failed") + return 1 + log("CREATE", f"VM ready: {ip}") + return 0 -def cmd_az_status(args: argparse.Namespace) -> None: - """Check Azure resource status for WAA benchmark deployment.""" - import subprocess - def run_az(cmd: list[str], description: str) -> tuple[bool, str]: - """Run an az command and return (success, output).""" - try: - result = subprocess.run( - ["az"] + cmd, - capture_output=True, - text=True, - timeout=30, - ) - return ( - result.returncode == 0, - result.stdout.strip() or result.stderr.strip(), - ) - except FileNotFoundError: - return False, 
"Azure CLI not installed" - except subprocess.TimeoutExpired: - return False, "Command timed out" - except Exception as e: - return False, str(e) - - print("\n=== Azure WAA Benchmark Status ===\n") - - # Check Azure CLI - ok, output = run_az(["--version"], "Azure CLI version") - if ok: - version = output.split("\n")[0] if output else "unknown" - print(f" Azure CLI: ✓ {version}") - else: - print(" Azure CLI: ✗ Not installed") - print(" Install: brew install azure-cli") - return +def cmd_delete(args): + """Delete VM and ALL associated resources.""" + init_logging() + log("DELETE", f"Deleting VM '{VM_NAME}' and all associated resources...") - # Check login - ok, output = run_az( - ["account", "show", "--query", "name", "-o", "tsv"], "Azure login" + # Delete VM + log("DELETE", "Deleting VM...") + result = subprocess.run( + [ + "az", + "vm", + "delete", + "-g", + RESOURCE_GROUP, + "-n", + VM_NAME, + "--yes", + "--force-deletion", + "true", + ], + capture_output=True, + text=True, ) - if ok: - print(f" Logged in: ✓ {output}") + if result.returncode == 0: + log("DELETE", "VM deleted") else: - print(" Logged in: ✗ Run: az login") - return - - # Check resource group - rg = args.resource_group - ok, output = run_az( - ["group", "show", "--name", rg, "--query", "location", "-o", "tsv"], - "Resource group", + log("DELETE", "VM not found or already deleted") + + # Delete NICs + log("DELETE", "Deleting NICs...") + result = subprocess.run( + [ + "az", + "network", + "nic", + "list", + "-g", + RESOURCE_GROUP, + "--query", + "[?contains(name, 'waa')].name", + "-o", + "tsv", + ], + capture_output=True, + text=True, ) - if ok: - print(f" Resource group: ✓ {rg} ({output})") - else: - print(f" Resource group: ✗ {rg} not found") - print(" Run: python scripts/setup_azure.py") - return + for nic in result.stdout.strip().split("\n"): + if nic: + subprocess.run( + ["az", "network", "nic", "delete", "-g", RESOURCE_GROUP, "-n", nic], + capture_output=True, + ) + log("DELETE", f" Deleted NIC: {nic}") - # Check ML workspace - ws = args.workspace - ok, output = run_az( + # Delete public IPs + log("DELETE", "Deleting public IPs...") + result = subprocess.run( [ - "ml", - "workspace", - "show", - "--name", - ws, - "--resource-group", - rg, + "az", + "network", + "public-ip", + "list", + "-g", + RESOURCE_GROUP, "--query", - "location", + "[?contains(name, 'waa')].name", "-o", "tsv", ], - "ML workspace", + capture_output=True, + text=True, ) - if ok: - print(f" ML workspace: ✓ {ws} ({output})") - else: - print(f" ML workspace: ✗ {ws} not found") + for pip in result.stdout.strip().split("\n"): + if pip: + subprocess.run( + [ + "az", + "network", + "public-ip", + "delete", + "-g", + RESOURCE_GROUP, + "-n", + pip, + ], + capture_output=True, + ) + log("DELETE", f" Deleted IP: {pip}") - # Check ACR - acr = args.acr_name - ok, output = run_az( + # Delete disks + log("DELETE", "Deleting disks...") + result = subprocess.run( [ - "acr", - "show", - "--name", - acr, - "--resource-group", - rg, + "az", + "disk", + "list", + "-g", + RESOURCE_GROUP, "--query", - "loginServer", + "[?contains(name, 'waa')].name", "-o", "tsv", ], - "Container registry", + capture_output=True, + text=True, ) - if ok: - print(f" Container registry: ✓ {output}") - else: - print(f" Container registry: ✗ {acr} not found") + for disk in result.stdout.strip().split("\n"): + if disk: + subprocess.run( + ["az", "disk", "delete", "-g", RESOURCE_GROUP, "-n", disk, "--yes"], + capture_output=True, + ) + log("DELETE", f" Deleted disk: {disk}") - # Check WAA Docker 
image - ok, output = run_az( + # Delete NSGs + log("DELETE", "Deleting NSGs...") + result = subprocess.run( [ - "acr", - "repository", - "show", - "--name", - acr, - "--repository", - "winarena", + "az", + "network", + "nsg", + "list", + "-g", + RESOURCE_GROUP, "--query", - "imageName", + "[?contains(name, 'waa')].name", "-o", "tsv", ], - "WAA Docker image", + capture_output=True, + text=True, ) - if ok: - print(" WAA Docker image: ✓ winarena") - else: - print(" WAA Docker image: ✗ Not imported") - print(" Run: python scripts/setup_azure.py") - - # Check .env file - env_path = Path(".env") - if env_path.exists(): - env_content = env_path.read_text() - has_azure = "AZURE_SUBSCRIPTION_ID" in env_content - print( - f" .env file: ✓ {'Azure credentials found' if has_azure else 'Missing Azure credentials'}" - ) - else: - print(" .env file: ✗ Not found") + for nsg in result.stdout.strip().split("\n"): + if nsg: + subprocess.run( + ["az", "network", "nsg", "delete", "-g", RESOURCE_GROUP, "-n", nsg], + capture_output=True, + ) + log("DELETE", f" Deleted NSG: {nsg}") - # Check WAA submodule - waa_path = find_waa_path() - if waa_path: - # Count tasks - from openadapt_ml.benchmarks import WAAAdapter + log("DELETE", "Cleanup complete") + return 0 - try: - adapter = WAAAdapter(waa_repo_path=waa_path) - task_count = len(adapter.list_tasks()) - print(f" WAA submodule: ✓ {task_count} tasks at {waa_path}") - except Exception as e: - print(f" WAA submodule: ⚠ Found but error: {e}") - else: - print(" WAA submodule: ✗ Not found") - print(" Run: git submodule update --init --recursive") - print() - print( - "Ready for benchmark evaluation!" - if ok - else "Some resources missing - run setup_azure.py" - ) +def cmd_status(args): + """Show VM status.""" + ip = get_vm_ip() + state = get_vm_state() + if not ip: + print(f"VM '{VM_NAME}' not found") + return 1 -def cmd_run_local(args: argparse.Namespace) -> None: - """Run evaluation locally on Windows.""" - from openadapt_ml.benchmarks import ( - RandomAgent, - WAAAdapter, - compute_metrics, - evaluate_agent_on_benchmark, - ) + print(f"VM: {VM_NAME}") + print(f" State: {state or 'unknown'}") + print(f" IP: {ip}") + print(f" Size: {VM_SIZE}") + print(f" SSH: ssh azureuser@{ip}") + return 0 - # Check platform - if sys.platform != "win32" and not args.force: - print("ERROR: WAA requires Windows. Use --force to override.") - sys.exit(1) - # Parse task IDs - task_ids = None - if args.tasks: - task_ids = [t.strip() for t in args.tasks.split(",")] +def cmd_build(args): + """Build WAA image from waa_deploy/Dockerfile. - # Get WAA path (auto-detect if not specified) - waa_path = get_waa_path(args.waa_path) + This builds our custom image that: + - Uses dockurr/windows:latest (has working ISO auto-download) + - Copies WAA components from windowsarena/winarena:latest + - Patches IP addresses and adds automation + """ + init_logging() - # Create adapter - adapter = WAAAdapter(waa_repo_path=waa_path) + ip = get_vm_ip() + if not ip: + log("BUILD", "ERROR: VM not found. 
Run 'create' first.") + return 1 - # Create agent (for now, just random - in practice, would load a model) - if args.agent == "random": - agent = RandomAgent(seed=args.seed) - else: - print(f"ERROR: Unknown agent type: {args.agent}") - sys.exit(1) - - # Run evaluation - print("\nRunning WAA evaluation...") - print(f" WAA path: {waa_path}") - print(f" Tasks: {len(task_ids) if task_ids else 'all (154)'}") - print(f" Max steps: {args.max_steps}") - print() + log("BUILD", "Building WAA image from waa_deploy/Dockerfile...") - results = evaluate_agent_on_benchmark( - agent=agent, - adapter=adapter, - task_ids=task_ids, - max_steps=args.max_steps, - ) + # Check Dockerfile exists + if not DOCKERFILE_PATH.exists(): + log("BUILD", f"ERROR: Dockerfile not found: {DOCKERFILE_PATH}") + return 1 - # Print results - metrics = compute_metrics(results) - print("\n=== Results ===") - print(f"Tasks: {metrics['num_tasks']}") - print(f"Success rate: {metrics['success_rate']:.1%}") - print(f"Avg score: {metrics['avg_score']:.3f}") - print(f"Avg steps: {metrics['avg_steps']:.1f}") - print() + # Copy Dockerfile and supporting files to VM + log("BUILD", "Copying build files to VM...") + ssh_run(ip, "mkdir -p ~/build") - # Save results - if args.output: - output_path = Path(args.output) - with open(output_path, "w") as f: - json.dump( - { - "metrics": metrics, - "results": [ - { - "task_id": r.task_id, - "success": r.success, - "score": r.score, - "num_steps": r.num_steps, - "error": r.error, - } - for r in results - ], - }, - f, - indent=2, + waa_deploy_dir = DOCKERFILE_PATH.parent + files_to_copy = ["Dockerfile", "start_waa_server.bat", "api_agent.py"] + for filename in files_to_copy: + src = waa_deploy_dir / filename + if src.exists(): + result = subprocess.run( + ["scp", *SSH_OPTS, str(src), f"azureuser@{ip}:~/build/"], + capture_output=True, + text=True, ) - print(f"Results saved to: {output_path}") + if result.returncode != 0: + log("BUILD", f"ERROR: Failed to copy {filename}: {result.stderr}") + return 1 + # Pre-build cleanup + log("BUILD", "Cleaning up dangling images before build...") + ssh_run(ip, "docker image prune -f 2>/dev/null") -def _get_azure_ml_studio_url( - subscription_id: str, - resource_group: str, - workspace_name: str, - view: str = "compute", -) -> str: - """Get Azure ML Studio URL for a workspace. + # Build image (streams output) + log("BUILD", "Running docker build (this takes ~10-15 minutes)...") + build_cmd = f"cd ~/build && docker build --pull -t {DOCKER_IMAGE} . 
2>&1" + result = ssh_run(ip, build_cmd, stream=True, step="BUILD") - Args: - subscription_id: Azure subscription ID - resource_group: Resource group name - workspace_name: ML workspace name - view: Which view to open - "compute", "jobs", "overview" + if result.returncode != 0: + log("BUILD", "ERROR: Docker build failed") + return 1 - Returns: - Azure ML Studio URL - """ - workspace_id = ( - f"/subscriptions/{subscription_id}" - f"/resourceGroups/{resource_group}" - f"/providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}" - ) + # Post-build cleanup + log("BUILD", "Cleaning up dangling images after build...") + ssh_run(ip, "docker image prune -f 2>/dev/null") - # Azure ML Studio URL format - # The experiments page shows all jobs for this workspace - # Format: https://ml.azure.com/experiments/id/{experiment_id}?wsid={workspace_id} - # NOTE: This experiment_id is specific to the openadapt-ml workspace - # TODO: Retrieve experiment_id dynamically from Azure instead of hardcoding - experiment_id = "ad29082c-0607-4fda-8cc7-38944eb5a518" - return f"https://ml.azure.com/experiments/id/{experiment_id}?wsid={workspace_id}" - - -def _write_azure_job_status( - output_dir: Path, - job_id: str, - status: str, - workers: int, - num_tasks: int, - task_ids: list[str] | None, - azure_url: str, - start_time: str | None = None, - end_time: str | None = None, - results: dict | None = None, -) -> None: - """Write Azure job status to a JSON file for the benchmark viewer.""" - import datetime - - jobs_file = output_dir / "azure_jobs.json" - - # Load existing jobs - jobs = [] - if jobs_file.exists(): - try: - with open(jobs_file) as f: - jobs = json.load(f) - except json.JSONDecodeError: - jobs = [] - - # Find or create this job - job_entry = None - for job in jobs: - if job.get("job_id") == job_id: - job_entry = job - break + log("BUILD", f"Image built: {DOCKER_IMAGE}") + return 0 - if job_entry is None: - job_entry = { - "job_id": job_id, - "started_at": start_time or datetime.datetime.now().isoformat(), - } - jobs.insert(0, job_entry) # Most recent first - - # Update job entry - job_entry.update( - { - "status": status, - "workers": workers, - "num_tasks": num_tasks, - "task_ids": task_ids[:5] - if task_ids and len(task_ids) > 5 - else task_ids, # First 5 for display - "azure_dashboard_url": azure_url, - "updated_at": datetime.datetime.now().isoformat(), - } - ) - if end_time: - job_entry["ended_at"] = end_time - if results: - job_entry["results"] = results +def cmd_start(args): + """Start WAA container.""" + init_logging() - # Keep only last 10 jobs - jobs = jobs[:10] + ip = get_vm_ip() + if not ip: + log("START", "ERROR: VM not found. 
Run 'create' first.") + return 1 - # Write back - output_dir.mkdir(parents=True, exist_ok=True) - with open(jobs_file, "w") as f: - json.dump(jobs, f, indent=2) + log("START", "Starting WAA container...") + # Stop existing container + log("START", "Stopping any existing container...") + ssh_run(ip, "docker stop winarena 2>/dev/null; docker rm -f winarena 2>/dev/null") -def cmd_run_azure(args: argparse.Namespace) -> None: - """Run evaluation on Azure.""" - import datetime - import random - from openadapt_ml.benchmarks import RandomAgent, WAAAdapter - from openadapt_ml.benchmarks.azure import AzureConfig, AzureWAAOrchestrator + # Clean storage if --fresh + if args.fresh: + log("START", "Cleaning storage for fresh Windows install...") + ssh_run(ip, "sudo rm -rf /mnt/waa-storage/*") + + # Create storage directory + ssh_run( + ip, + "sudo mkdir -p /mnt/waa-storage && sudo chown azureuser:azureuser /mnt/waa-storage", + ) + + # Start container + # Our custom image has ENTRYPOINT that handles everything: + # - Downloads Windows 11 Enterprise if not present + # - Boots QEMU VM + # - Runs WAA server automatically via FirstLogonCommands + log("START", "Starting container with VERSION=11e...") + docker_cmd = f"""docker run -d \\ + --name winarena \\ + --device=/dev/kvm \\ + --cap-add NET_ADMIN \\ + -p 8006:8006 \\ + -p 5000:5000 \\ + -p 7200:7200 \\ + -v /mnt/waa-storage:/storage \\ + -e VERSION=11e \\ + -e RAM_SIZE=8G \\ + -e CPU_CORES=4 \\ + -e DISK_SIZE=64G \\ + {DOCKER_IMAGE}""" - # Load config - if args.config: - config = AzureConfig.from_json(args.config) - else: - config = AzureConfig.from_env() - - # Get WAA path (auto-detect if not specified) - waa_path = get_waa_path(args.waa_path) - - # Load WAA adapter to get available tasks - adapter = WAAAdapter(waa_repo_path=waa_path) - all_tasks = adapter.list_tasks() # Returns list[BenchmarkTask] - all_task_ids = [t.task_id for t in all_tasks] # Extract task_id strings - print(f" Available tasks: {len(all_task_ids)}") - - # Determine which tasks to run - task_ids = None - if args.task_ids: - # Specific task IDs provided - task_ids = [t.strip() for t in args.task_ids.split(",")] - # Validate task IDs exist - invalid = [t for t in task_ids if t not in all_task_ids] - if invalid: - print(f"ERROR: Invalid task IDs: {invalid[:5]}...") - print(f" Available tasks start with: {all_task_ids[:3]}") - sys.exit(1) - elif args.num_tasks: - # Select random subset of tasks - random.seed(args.seed) - num_to_select = min(args.num_tasks, len(all_task_ids)) - task_ids = random.sample(all_task_ids, num_to_select) # Sample from string IDs - print(f" Selected {num_to_select} random tasks") - - # Create orchestrator - orchestrator = AzureWAAOrchestrator( - config=config, - waa_repo_path=waa_path, - experiment_name=args.experiment, - ) + result = ssh_run(ip, docker_cmd) + if result.returncode != 0: + log("START", f"ERROR: Failed to start container: {result.stderr}") + return 1 - # Create agent - if args.agent == "random": - agent = RandomAgent(seed=args.seed) - else: - print(f"ERROR: Unknown agent type: {args.agent}") - sys.exit(1) + log("START", "Container started") + log("START", "Windows will boot and install (15-20 min on first run)") + log("START", "Monitor via: uv run python -m openadapt_ml.benchmarks.cli_v2 logs") + log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{ip}") + return 0 - # Estimate costs first - from openadapt_ml.benchmarks.azure import estimate_cost - num_tasks = len(task_ids) if task_ids else len(all_task_ids) - estimate = 
estimate_cost(num_tasks=num_tasks, num_workers=args.workers) +def cmd_stop(args): + """Stop and remove WAA container.""" + ip = get_vm_ip() + if not ip: + print("ERROR: VM not found") + return 1 - print("\n=== Azure WAA Evaluation ===") - print(f" Workers: {args.workers}") - print(f" Tasks: {num_tasks}") - print(f" Job timeout: {args.timeout} hours") - print(f" Estimated cost: ${estimate['estimated_cost_usd']:.2f}") - print(f" Estimated time: {estimate['estimated_duration_minutes']:.1f} minutes") - print() + print(f"Stopping container on VM ({ip})...") - if not args.yes: - response = input("Proceed? [y/N] ") - if response.lower() != "y": - print("Aborted.") - sys.exit(0) - - # Generate job ID and Azure dashboard URL - job_id = f"waa_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}" - azure_url = _get_azure_ml_studio_url( - subscription_id=config.subscription_id, - resource_group=config.resource_group, - workspace_name=config.workspace_name, - view="compute", + # Stop container + result = ssh_run( + ip, "docker stop winarena 2>/dev/null && echo STOPPED || echo NOT_RUNNING" ) - output_dir = Path("benchmark_results") - start_time = datetime.datetime.now().isoformat() - - # Write initial job status - _write_azure_job_status( - output_dir=output_dir, - job_id=job_id, - status="provisioning", - workers=args.workers, - num_tasks=num_tasks, - task_ids=task_ids, - azure_url=azure_url, - start_time=start_time, - ) - - # Run evaluation - print("\nStarting Azure evaluation...") - print(f" Job ID: {job_id}") - print(f" Monitor at: {azure_url}") - print(" (VM provisioning takes 3-5 minutes)") - print() + if "STOPPED" in result.stdout: + print(" Container stopped") + else: + print(" Container was not running") - try: - # Update status to running once provisioning starts - _write_azure_job_status( - output_dir=output_dir, - job_id=job_id, - status="running", - workers=args.workers, - num_tasks=num_tasks, - task_ids=task_ids, - azure_url=azure_url, - ) + # Remove container + result = ssh_run( + ip, "docker rm -f winarena 2>/dev/null && echo REMOVED || echo NOT_FOUND" + ) + if "REMOVED" in result.stdout: + print(" Container removed") + else: + print(" Container already removed") + + # Optionally clean storage + if hasattr(args, "clean") and args.clean: + print(" Cleaning Windows storage...") + ssh_run(ip, "sudo rm -rf /mnt/waa-storage/*") + print(" Storage cleaned") + + print("Done") + return 0 + + +def cmd_probe(args): + """Check if WAA server is ready.""" + ip = get_vm_ip() + if not ip: + print("ERROR: VM not found") + return 1 + + timeout = args.timeout + start = time.time() + last_storage = None + + while True: + # Check via SSH - must run curl INSIDE container to reach Docker network + result = ssh_run( + ip, + "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL", + ) + + if "FAIL" not in result.stdout and result.stdout.strip(): + print("\nWAA server is READY") + print(f" Response: {result.stdout.strip()[:100]}") + return 0 + + if not args.wait: + print("WAA server is NOT ready") + return 1 + + elapsed = time.time() - start + if elapsed > timeout: + print(f"\nTIMEOUT: WAA server not ready after {timeout}s") + return 1 + + # Get detailed status for progress display + elapsed_min = int(elapsed // 60) + elapsed_sec = int(elapsed % 60) + + # Get storage in bytes for detailed view + storage_result = ssh_run( + ip, "docker exec winarena du -sb /storage/ 2>/dev/null | cut -f1" + ) + storage_bytes = storage_result.stdout.strip() + if storage_bytes.isdigit(): + 
storage_mb = int(storage_bytes) / (1024 * 1024) + storage_str = f"{storage_mb:,.1f} MB" + # Show delta if we have previous value + if last_storage is not None: + delta = int(storage_bytes) - last_storage + if delta > 0: + delta_mb = delta / (1024 * 1024) + storage_str += f" (+{delta_mb:,.1f} MB)" + last_storage = int(storage_bytes) + else: + storage_str = "unknown" - results = orchestrator.run_evaluation( - agent=agent, - num_workers=args.workers, - task_ids=task_ids, - max_steps_per_task=args.max_steps, - cleanup_on_complete=not args.no_cleanup, - timeout_hours=args.timeout, + # Get QEMU uptime + qemu_result = ssh_run( + ip, + 'docker exec winarena sh -c \'QPID=$(pgrep -f qemu-system 2>/dev/null | head -1); [ -n "$QPID" ] && ps -o etime= -p $QPID 2>/dev/null | tr -d " " || echo N/A\'', ) + qemu_uptime = qemu_result.stdout.strip() or "N/A" - # Print results - from openadapt_ml.benchmarks import compute_metrics - - metrics = compute_metrics(results) - print("\n=== Results ===") - print(f"Tasks: {metrics['num_tasks']}") - print(f"Success rate: {metrics['success_rate']:.1%}") - print(f"Avg score: {metrics['avg_score']:.3f}") - print() - - # Update job status to completed - _write_azure_job_status( - output_dir=output_dir, - job_id=job_id, - status="completed", - workers=args.workers, - num_tasks=num_tasks, - task_ids=task_ids, - azure_url=azure_url, - end_time=datetime.datetime.now().isoformat(), - results={ - "success_rate": metrics.get("success_rate", 0.0), - "num_success": metrics.get("success_count", 0), - "avg_score": metrics.get("avg_score", 0.0), - }, + # Get container uptime + container_result = ssh_run( + ip, "docker ps --filter name=winarena --format '{{.Status}}' 2>/dev/null" ) + container_status = container_result.stdout.strip() or "unknown" - # Save results - if args.output: - output_path = Path(args.output) - with open(output_path, "w") as f: - json.dump( - { - "metrics": metrics, - "run_status": orchestrator.get_run_status(), - "results": [ - { - "task_id": r.task_id, - "success": r.success, - "score": r.score, - "num_steps": r.num_steps, - } - for r in results - ], - }, - f, - indent=2, - ) - print(f"Results saved to: {output_path}") - - except Exception as e: - # Update job status to failed - _write_azure_job_status( - output_dir=output_dir, - job_id=job_id, - status="failed", - workers=args.workers, - num_tasks=num_tasks, - task_ids=task_ids, - azure_url=azure_url, - end_time=datetime.datetime.now().isoformat(), - results={"error": str(e)}, + print( + f"[{elapsed_min:02d}:{elapsed_sec:02d}] Waiting... 
| Storage: {storage_str} | QEMU: {qemu_uptime} | Container: {container_status}" ) - raise - - -def cmd_test_mock(args: argparse.Namespace) -> None: - """Test with mock adapter (no Windows required).""" - from openadapt_ml.benchmarks import ( - RandomAgent, - WAAMockAdapter, - compute_domain_metrics, - compute_metrics, - evaluate_agent_on_benchmark, - ) - - print("\n=== Testing with Mock Adapter ===") - print(f" Tasks: {args.tasks}") - print(f" Max steps: {args.max_steps}") - print() - - # Create mock adapter - adapter = WAAMockAdapter(num_tasks=args.tasks) - agent = RandomAgent(seed=args.seed) - - # Run evaluation - results = evaluate_agent_on_benchmark( - agent=agent, - adapter=adapter, - max_steps=args.max_steps, - ) - - # Print results - metrics = compute_metrics(results) - print("=== Results ===") - print(f"Tasks: {metrics['num_tasks']}") - print(f"Success rate: {metrics['success_rate']:.1%}") - print(f"Successes: {metrics['success_count']}") - print(f"Failures: {metrics['fail_count']}") - print(f"Avg steps: {metrics['avg_steps']:.1f}") - print() - - # Domain breakdown - tasks = adapter.list_tasks() - domain_metrics = compute_domain_metrics(results, tasks) - if domain_metrics: - print("=== By Domain ===") - for domain, dm in domain_metrics.items(): - print( - f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})" - ) - print() - - -def cmd_test_smart(args: argparse.Namespace) -> None: - """Test with SmartMockAgent (expected 100% success).""" - from openadapt_ml.benchmarks import ( - SmartMockAgent, - WAAMockAdapter, - evaluate_agent_on_benchmark, - ) - - print("\n=== Testing with SmartMockAgent ===") - print(f" Tasks: {args.tasks}") - print(f" Max steps: {args.max_steps}") - print() - - # Create mock adapter and smart agent - adapter = WAAMockAdapter(num_tasks=args.tasks) - agent = SmartMockAgent() - - # Run evaluation - results = evaluate_agent_on_benchmark( - agent=agent, - adapter=adapter, - max_steps=args.max_steps, - ) - - # Print results - success_count = sum(1 for r in results if r.success) - print("=== Results ===") - print( - f"Success rate: {success_count}/{len(results)} ({100 * success_count / len(results):.0f}%)" - ) - - if success_count != len(results): - print("\nWARNING: Expected 100% success with SmartMockAgent") - for r in results: - if not r.success: - print(f" FAIL {r.task_id}: {r.reason}") - print() + time.sleep(30) -def cmd_test_collection(args: argparse.Namespace) -> None: - """Test benchmark data collection with mock adapter. +def cmd_run(args): + """Run benchmark tasks using vanilla WAA's navi agent. - This command runs a benchmark evaluation with data collection enabled, - creating a full directory structure with screenshots, execution traces, - and metadata suitable for the benchmark viewer. + Note: For API-based agents (Claude, GPT-4 direct), use openadapt-evals + which communicates with WAA's Flask API externally. 
""" - import json - from pathlib import Path - - from openadapt_ml.benchmarks import RandomAgent, WAAMockAdapter - from openadapt_ml.benchmarks.runner import ( - EvaluationConfig, - evaluate_agent_on_benchmark, - ) - - print("\n=== Testing Benchmark Data Collection ===") - print(f" Tasks: {args.tasks}") - print(f" Max steps: {args.max_steps}") - print(f" Output dir: {args.output}") - print(f" Run name: {args.run_name or '(auto-generated)'}") - print() - - # Create mock adapter - adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"]) - agent = RandomAgent( - action_types=["click", "type", "scroll", "done"], seed=args.seed - ) + init_logging() - # Configure evaluation with data collection - config = EvaluationConfig( - max_steps=args.max_steps, - parallel=1, - save_trajectories=True, - save_execution_traces=True, - model_id=args.model_id, - output_dir=args.output, - run_name=args.run_name, - verbose=True, - ) + ip = get_vm_ip() + if not ip: + log("RUN", "ERROR: VM not found") + return 1 - # Run evaluation - results = evaluate_agent_on_benchmark( - agent=agent, - adapter=adapter, - config=config, + # Check WAA is ready + log("RUN", "Checking WAA server...") + result = ssh_run( + ip, + "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null || echo FAIL", ) + if "FAIL" in result.stdout or not result.stdout.strip(): + log("RUN", "ERROR: WAA server not ready. Run 'probe --wait' first.") + return 1 - # Print results - success_count = sum(1 for r in results if r.success) - success_rate = success_count / len(results) if results else 0.0 - avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0.0 - - print("\n=== Results ===") - print(f"Total tasks: {len(results)}") - print(f"Success: {success_count} ({success_rate:.1%})") - print(f"Failure: {len(results) - success_count}") - print(f"Avg steps: {avg_steps:.1f}") - - # Find the actual output directory by reading metadata - output_dir = Path(args.output) - run_dirs = sorted( - output_dir.glob("*/metadata.json"), - key=lambda p: p.stat().st_mtime, - reverse=True, - ) - if run_dirs: - run_dir = run_dirs[0].parent - with open(run_dirs[0]) as f: - metadata = json.load(f) - metadata.get("run_name", run_dir.name) - else: - run_dir = output_dir - - print("\n=== Output Directory ===") - print(f"Location: {run_dir.absolute()}") - print("\nDirectory structure:") - print(f" {run_dir.name}/") - print(" ├── metadata.json") - print(" ├── summary.json") - print(" └── tasks/") - print(" ├── task_001/") - print(" │ ├── task.json") - print(" │ ├── execution.json") - print(" │ └── screenshots/") - print(" │ ├── step_000.png") - print(" │ ├── step_001.png") - print(" │ └── ...") - print(" └── ...") - print(f"\nYou can inspect the results at: {run_dir.absolute()}") - print() + log("RUN", "WAA server is ready") + # Get API key (navi uses GPT-4o for reasoning) + api_key = args.api_key + if not api_key: + try: + from openadapt_ml.config import settings -def cmd_waa_demo(args: argparse.Namespace) -> None: - """Run WAA demo-conditioned experiment. + api_key = settings.openai_api_key or "" + except ImportError: + api_key = "" - This runs the demo-conditioned prompting experiment comparing - zero-shot vs demo-conditioned performance on WAA tasks. 
+ if not api_key: + log("RUN", "ERROR: OpenAI API key required (navi uses GPT-4o)") + log("RUN", " Set OPENAI_API_KEY in .env file or pass --api-key") + return 1 + + # Build task selection + domain = args.domain + task = args.task + model = args.model + + task_info = [] + if task: + task_info.append(f"task={task}") + elif domain != "all": + task_info.append(f"domain={domain}") + else: + task_info.append(f"{args.num_tasks} task(s)") - The experiment validates that including task demonstrations - significantly improves first-action accuracy. - """ - from openadapt_ml.experiments.waa_demo.runner import ( - DemoConditionedAgent, - get_complete_demos, - get_task, - ) - from openadapt_ml.benchmarks import ( - WAAMockAdapter, - compute_metrics, - ) - from openadapt_ml.benchmarks.runner import ( - EvaluationConfig, - evaluate_agent_on_benchmark, - ) + log("RUN", f"Starting benchmark: {', '.join(task_info)}, model={model}") - print("\n=== WAA Demo-Conditioned Experiment ===") - print(f" Condition: {args.condition}") - print(f" Provider: {args.provider}") - print(f" Tasks: {args.tasks or 'all with demos'}") - print(f" Max steps: {args.max_steps}") - print() + # Build run.py arguments + run_args = [ + "--agent_name navi", + f"--model {model}", + f"--domain {domain}", + ] - # Determine which tasks to run - task_ids = None - if args.tasks: - task_nums = [t.strip() for t in args.tasks.split(",")] - task_ids = [] - for num in task_nums: - task = get_task(num) - if task: - task_ids.append(task.task_id) - else: - print(f" Warning: Task {num} not found") - else: - complete_demos = get_complete_demos() - task_ids = [] - for num in complete_demos.keys(): - task = get_task(num) - if task: - task_ids.append(task.task_id) - print(f" Running {len(task_ids)} tasks with complete demos") - - # Determine adapter - if args.mock: - print(" Using mock adapter (no Windows required)") - adapter = WAAMockAdapter(num_tasks=len(task_ids) if task_ids else 10) - task_ids = None + # If specific task requested, create custom test config + if task: + create_custom_test_cmd = f''' +cat > /client/evaluation_examples_windows/test_custom.json << 'CUSTOMEOF' +["{task}"] +CUSTOMEOF +''' + run_args.append( + "--test_all_meta_path evaluation_examples_windows/test_custom.json" + ) + pre_cmd = create_custom_test_cmd + elif args.num_tasks and args.num_tasks < 154: + # Limit tasks by creating custom test config with first N tasks + num = args.num_tasks + # Write a temp Python script then run it (avoids quote escaping hell) + # test_all.json is a dict {{domain: [task_ids...]}} - preserve domain structure + create_limited_test_cmd = f"""cat > /tmp/limit_tasks.py << LIMITEOF +import json +d = json.load(open("/client/evaluation_examples_windows/test_all.json")) +# Collect (domain, task_id) pairs to preserve domain info +all_tasks = [] +for domain, tasks in d.items(): + for task in tasks: + all_tasks.append((domain, task)) +# Limit total tasks +limited = all_tasks[:{num}] +# Rebuild dict preserving original domain structure +result = {{}} +for domain, task in limited: + if domain not in result: + result[domain] = [] + result[domain].append(task) +json.dump(result, open("/client/evaluation_examples_windows/test_limited.json", "w")) +print("Limited to", len(limited), "tasks from", len(result), "domains") +LIMITEOF +python /tmp/limit_tasks.py && """ + run_args.append( + "--test_all_meta_path evaluation_examples_windows/test_limited.json" + ) + pre_cmd = create_limited_test_cmd else: - # Auto-detect WAA - waa_path = find_waa_path() - if waa_path and 
sys.platform == "win32": - from openadapt_ml.benchmarks import WAAAdapter + pre_cmd = "" - print(f" Using real WAA from: {waa_path}") - adapter = WAAAdapter(waa_repo_path=waa_path) - else: - print(" WAA not available, using mock adapter") - adapter = WAAMockAdapter(num_tasks=len(task_ids) if task_ids else 10) - task_ids = None - - # Create agent - agent = DemoConditionedAgent( - provider=args.provider, - condition=args.condition, - max_tokens=args.max_tokens, - use_accessibility_tree=not args.no_a11y, - use_history=not args.no_history, + # Run the benchmark inside the container + run_cmd = ( + f'export OPENAI_API_KEY="{api_key}" && ' + f"docker exec -e OPENAI_API_KEY winarena " + f"bash -c '{pre_cmd}cd /client && python run.py {' '.join(run_args)}'" ) - # Configure evaluation - model_id = f"{args.provider}-{args.condition}" - config = EvaluationConfig( - max_steps=args.max_steps, - parallel=1, - save_trajectories=True, - save_execution_traces=True, - model_id=model_id, - output_dir=args.output, - run_name=args.run_name, - verbose=args.verbose, - ) + log("RUN", "Executing benchmark...") + log("RUN", f" Model: {model}") + log("RUN", f" Tasks: {task_info[0]}") + log("RUN", "-" * 60) - print() - print("Starting evaluation...") - print("(Each step calls the VLM API - this may take a while)") - print() + # Run with streaming output + result = ssh_run(ip, run_cmd, stream=True, step="RUN") - try: - results = evaluate_agent_on_benchmark( - agent=agent, - adapter=adapter, - task_ids=task_ids, - config=config, - ) - except Exception as e: - print(f"\nERROR: {e}") - key_name = ( - "ANTHROPIC_API_KEY" if args.provider == "anthropic" else "OPENAI_API_KEY" - ) - if "API key" in str(e) or "api_key" in str(e).lower(): - print(f"\nMake sure {key_name} is set in your environment or .env file.") - sys.exit(1) - - # Print results - metrics = compute_metrics(results) - print("\n=== Results ===") - print(f"Condition: {args.condition}") - print(f"Tasks: {metrics['num_tasks']}") - print(f"Success rate: {metrics['success_rate']:.1%}") - print(f"Successes: {metrics['success_count']}") - print(f"Failures: {metrics['fail_count']}") - print(f"Avg steps: {metrics['avg_steps']:.1f}") - print() + if result.returncode != 0: + log("RUN", f"Benchmark failed with exit code {result.returncode}") + else: + log("RUN", "Benchmark completed!") - # Per-task results - if args.verbose: - print("Per-task results:") - for result in results: - status = "PASS" if result.success else "FAIL" - print(f" {result.task_id}: {status} ({result.num_steps} steps)") - print() + # Download results unless --no-download + if not args.no_download: + log("RUN", "Downloading results...") + download_benchmark_results(ip) - # Output location - output_dir = Path(args.output) - run_dirs = sorted( - output_dir.glob("*/metadata.json"), - key=lambda p: p.stat().st_mtime, - reverse=True, - ) - if run_dirs: - run_dir = run_dirs[0].parent - print(f"Results saved to: {run_dir.absolute()}") - print() + return result.returncode -def cmd_run_api(args: argparse.Namespace) -> None: - """Run evaluation using API-backed VLM (Claude/GPT-5.1). +def download_benchmark_results(ip: str) -> str: + """Download benchmark results from the container. - This provides baselines for comparing against fine-tuned models. + Results are saved to benchmark_results/waa_results_TIMESTAMP/ + Returns the path to the results directory, or None if failed. 
""" - from openadapt_ml.benchmarks import ( - APIBenchmarkAgent, - WAAMockAdapter, - compute_domain_metrics, - compute_metrics, - ) - from openadapt_ml.benchmarks.runner import ( - EvaluationConfig, - evaluate_agent_on_benchmark, - ) - - provider_names = { - "anthropic": "Claude", - "openai": "GPT-5.1", - } - - print("\n=== API-Backed Benchmark Evaluation ===") - print( - f" Provider: {args.provider} ({provider_names.get(args.provider, 'Unknown')})" - ) - print(f" Tasks: {args.tasks}") - print(f" Max steps: {args.max_steps}") - print(f" Output dir: {args.output}") + from pathlib import Path - # Check for API key - import os + # Create local results directory with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + results_dir = Path("benchmark_results") / f"waa_results_{timestamp}" + results_dir.mkdir(parents=True, exist_ok=True) - key_name = "ANTHROPIC_API_KEY" if args.provider == "anthropic" else "OPENAI_API_KEY" - if not os.getenv(key_name): - print(f"WARNING: {key_name} environment variable not set!") - print(" Set it in your .env file or export it before running.") - print() + log("RUN", f"Saving results to {results_dir}/") - # Determine which adapter to use - task_ids = None - if args.mock: - # User explicitly requested mock adapter - print(" Adapter: Mock (forced by --mock flag)") - print() - adapter = WAAMockAdapter(num_tasks=args.tasks, domains=["browser", "office"]) - else: - # Auto-detect WAA or use explicit path - waa_path = None - if args.waa_path: - # Explicit path provided - waa_path = Path(args.waa_path) - if not waa_path.exists(): - print(f"ERROR: WAA path does not exist: {waa_path}") - sys.exit(1) - else: - # Try to auto-detect - waa_path = find_waa_path() + # Create tarball of results inside container + log("RUN", "Creating results archive...") + tar_cmd = "docker exec winarena tar -czvf /tmp/results.tar.gz -C /client/results . 2>/dev/null" + result = subprocess.run( + ["ssh", *SSH_OPTS, f"azureuser@{ip}", tar_cmd], capture_output=True, text=True + ) - if waa_path: - # Real WAA available - if sys.platform != "win32" and not args.force: - print(f" Adapter: WAA (detected at {waa_path})") - print( - "ERROR: WAA requires Windows. Use --mock to use mock adapter instead." - ) - sys.exit(1) + if result.returncode != 0: + log( + "RUN", + f"Warning: Failed to create archive: {result.stderr[:200] if result.stderr else 'unknown'}", + ) + log("RUN", "Trying direct copy...") - from openadapt_ml.benchmarks import WAAAdapter + # Try copying results directory directly + copy_cmd = "docker cp winarena:/client/results/. 
/tmp/waa-results/" + subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + f"rm -rf /tmp/waa-results && mkdir -p /tmp/waa-results && {copy_cmd}", + ], + capture_output=True, + ) - print(f" Adapter: WAA (real, from {waa_path})") - print() - adapter = WAAAdapter(waa_repo_path=waa_path) - if args.task_ids: - task_ids = [t.strip() for t in args.task_ids.split(",")] + # Download via scp + scp_result = subprocess.run( + [ + "scp", + "-r", + *SSH_OPTS, + f"azureuser@{ip}:/tmp/waa-results/*", + str(results_dir), + ], + capture_output=True, + text=True, + ) + if scp_result.returncode == 0: + log("RUN", f"Results saved to: {results_dir}") + return str(results_dir) else: - # WAA not found, fall back to mock - print(" Adapter: Mock (WAA not found)") - print( - " Note: To use real WAA, run: git submodule update --init --recursive" - ) - print( - " Or specify with: --waa-path /path/to/WindowsAgentArena" - ) - print() - adapter = WAAMockAdapter( - num_tasks=args.tasks, domains=["browser", "office"] + log( + "RUN", + f"Warning: Failed to download results: {scp_result.stderr[:200] if scp_result.stderr else 'unknown'}", ) + return None - # Create API-backed agent - agent = APIBenchmarkAgent( - provider=args.provider, - max_tokens=args.max_tokens, - use_accessibility_tree=not args.no_a11y, - use_history=not args.no_history, + # Copy tarball from container to VM host + copy_tar_cmd = "docker cp winarena:/tmp/results.tar.gz /tmp/results.tar.gz" + subprocess.run( + ["ssh", *SSH_OPTS, f"azureuser@{ip}", copy_tar_cmd], capture_output=True ) - # Configure evaluation - model_id = args.model_id if args.model_id else f"{args.provider}-api" - config = EvaluationConfig( - max_steps=args.max_steps, - parallel=1, # API calls should be sequential to avoid rate limits - save_trajectories=True, - save_execution_traces=True, - model_id=model_id, - output_dir=args.output, - run_name=args.run_name, - verbose=args.verbose, + # Download tarball + local_tar = results_dir / "results.tar.gz" + scp_result = subprocess.run( + ["scp", *SSH_OPTS, f"azureuser@{ip}:/tmp/results.tar.gz", str(local_tar)], + capture_output=True, + text=True, ) - # Run evaluation - print("Starting evaluation...") - print(" (Each step calls the API - this may take a while)") - print() + if scp_result.returncode != 0: + log( + "RUN", + f"Warning: Failed to download tarball: {scp_result.stderr[:200] if scp_result.stderr else 'unknown'}", + ) + return None + + # Extract tarball + log("RUN", "Extracting results...") + import tarfile try: - results = evaluate_agent_on_benchmark( - agent=agent, - adapter=adapter, - task_ids=task_ids, - config=config, - ) + with tarfile.open(local_tar, "r:gz") as tar: + tar.extractall(path=results_dir) + local_tar.unlink() # Remove tarball after extraction except Exception as e: - print(f"\nERROR: {e}") - if "API key" in str(e) or "api_key" in str(e).lower(): - print(f"\nMake sure {key_name} is set in your environment.") - sys.exit(1) - - # Print results - metrics = compute_metrics(results) - print("\n=== Results ===") - print(f"Tasks: {metrics['num_tasks']}") - print(f"Success rate: {metrics['success_rate']:.1%}") - print(f"Successes: {metrics['success_count']}") - print(f"Failures: {metrics['fail_count']}") - print(f"Avg score: {metrics['avg_score']:.3f}") - print(f"Avg steps: {metrics['avg_steps']:.1f}") - print() - - # Domain breakdown - tasks = adapter.list_tasks() - domain_metrics = compute_domain_metrics(results, tasks) - if domain_metrics: - print("=== By Domain ===") - for domain, dm in domain_metrics.items(): - 
print( - f" {domain}: {dm['success_rate']:.1%} ({dm['success_count']}/{dm['num_tasks']})" - ) - print() + log("RUN", f"Warning: Failed to extract: {e}") + log("RUN", f"Tarball saved at: {local_tar}") - # Find output directory - output_dir = Path(args.output) - run_dirs = sorted( - output_dir.glob("*/metadata.json"), - key=lambda p: p.stat().st_mtime, - reverse=True, + # Clean up remote tarball + subprocess.run( + ["ssh", *SSH_OPTS, f"azureuser@{ip}", "rm -f /tmp/results.tar.gz"], + capture_output=True, ) - if run_dirs: - run_dir = run_dirs[0].parent - print(f"Results saved to: {run_dir.absolute()}") - print("View with: uv run python -m openadapt_ml.cloud.local serve --open") - print() + # List what we downloaded + result_files = list(results_dir.glob("**/*")) + log("RUN", f"Downloaded {len(result_files)} files to {results_dir}/") -def cmd_create_config(args: argparse.Namespace) -> None: - """Create a sample Azure config file.""" - from openadapt_ml.benchmarks.azure import AzureConfig + # Show summary if available + summary_file = results_dir / "summary.json" + if summary_file.exists(): + import json - config = AzureConfig( - subscription_id="", - resource_group="agents", - workspace_name="agents_ml", - vm_size="Standard_D8ds_v5", # 300GB temp storage for WAA - ) + try: + with open(summary_file) as f: + summary = json.load(f) + log("RUN", f"Summary: {json.dumps(summary, indent=2)[:500]}") + except Exception: + pass - output_path = Path(args.output) - config.to_json(output_path) - print(f"Sample config saved to: {output_path}") - print("\nEdit this file with your Azure credentials before using.") + return str(results_dir) -def cmd_status(args: argparse.Namespace) -> None: - """Check Azure workspace and compute status.""" - setup_logging(args.verbose) +def cmd_download(args): + """Download benchmark results from VM.""" + init_logging() - # Import after logging setup to suppress Azure SDK noise - from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient # noqa: E402 + ip = get_vm_ip() + if not ip: + log("DOWNLOAD", "ERROR: VM not found") + return 1 - print("\n=== Azure WAA Status ===\n") + log("DOWNLOAD", "Downloading benchmark results...") + result_path = download_benchmark_results(ip) - # Check config - try: - config = AzureConfig.from_env() - print(f"Subscription: {config.subscription_id[:8]}...") - print(f"Resource Group: {config.resource_group}") - print(f"Workspace: {config.workspace_name}") - print(f"VM Size: {config.vm_size}") - except ValueError as e: - print(f"Config Error: {e}") - print("\nRun 'python scripts/setup_azure.py' to configure.") - return - - # Check WAA - waa_path = find_waa_path() - if waa_path: - print(f"WAA Path: {waa_path}") + if result_path: + log("DOWNLOAD", f"Results saved to: {result_path}") + return 0 else: - print("WAA Path: NOT FOUND") - print(" Run: git submodule update --init --recursive") - - # Check Azure connection - print("\nConnecting to Azure...") - try: - client = AzureMLClient(config) - computes = client.list_compute_instances(prefix="w") - print("Connection: OK") - - if computes: - print(f"\nActive Compute Instances ({len(computes)}):") - for name in computes: - try: - status = client.get_compute_status(name) - print(f" - {name}: {status}") - except Exception: - print(f" - {name}: (status unknown)") - else: - print("\nNo active compute instances.") + log("DOWNLOAD", "Failed to download results") + return 1 - except Exception as e: - print("Connection: FAILED") - print(f" Error: {e}") - print() +def cmd_analyze(args): + """Analyze 
benchmark results from downloaded logs.
+
+    Parses the WAA client logs (logs/normal-*.log) for [Domain],
+    [Example ID], Reward, and exception lines, then prints overall and
+    per-domain success rates.
+    """
+    import re
+    from collections import defaultdict

-def cmd_cleanup(args: argparse.Namespace) -> None:
-    """Clean up all Azure compute resources."""
-    setup_logging(args.verbose)

+    # Use the given directory, else the most recent waa_results_* download
+    if args.results_dir:
+        target_dir = Path(args.results_dir)
+    else:
+        results_root = Path("benchmark_results")
+        dirs = sorted(results_root.glob("waa_results_*"), reverse=True)
+        if not dirs:
+            print("No results found in benchmark_results/")
+            print("Run 'cli download' first to get results from VM")
+            return 1
+        target_dir = dirs[0]
+
+    print(f"Analyzing: {target_dir}")
+    print("=" * 60)

-    from openadapt_ml.benchmarks.azure import AzureConfig, AzureMLClient

+    # Find log files
+    log_files = list(target_dir.glob("logs/normal-*.log"))
+    if not log_files:
+        print(f"No log files found in {target_dir}/logs/")
+        return 1
+
+    # Parse results
+    tasks = []
+    current_task = None
+    pending_domain = None
+
+    for log_file in sorted(log_files):
+        with open(log_file) as f:
+            for line in f:
+                # Strip ANSI codes
+                clean = re.sub(r"\x1b\[[0-9;]*m", "", line)
+
+                # Domain comes before Example ID
+                if "[Domain]:" in clean:
+                    match = re.search(r"\[Domain\]: (.+)", clean)
+                    if match:
+                        pending_domain = match.group(1).strip()
+
+                # Task start (Example ID comes after Domain)
+                if "[Example ID]:" in clean:
+                    match = re.search(r"\[Example ID\]: (.+)", clean)
+                    if match:
+                        current_task = {
+                            "id": match.group(1).strip(),
+                            "domain": pending_domain,
+                            "reward": None,
+                            "error": None,
+                        }
+                        pending_domain = None
+
+                # Task result
+                if "Reward:" in clean and current_task:
+                    match = re.search(r"Reward: ([0-9.]+)", clean)
+                    if match:
+                        current_task["reward"] = float(match.group(1))
+                        tasks.append(current_task)
+                        current_task = None
+
+                # Task error
+                if "Exception in" in clean and current_task:
+                    match = re.search(r"Exception in .+: (.+)", clean)
+                    if match:
+                        current_task["error"] = match.group(1).strip()
+                        current_task["reward"] = 0.0
+                        tasks.append(current_task)
+                        current_task = None

-    print("\n=== Azure WAA Cleanup ===\n")
+    # Summary
+    print(f"\nTotal tasks attempted: {len(tasks)}")

-    try:
-        config = AzureConfig.from_env()
-    except ValueError as e:
-        print(f"Config Error: {e}")
-        return
+    if not tasks:
+        print("No completed tasks found")
+        return 0

-    print(f"Workspace: {config.workspace_name}")
-    print(f"Resource Group: {config.resource_group}")
-    print()
+    # Success rate
+    successes = sum(1 for t in tasks if t["reward"] and t["reward"] > 0)
+    print(f"Successful: {successes} ({100 * successes / len(tasks):.1f}%)")

-    client = AzureMLClient(config)
+    # By domain
+    by_domain = defaultdict(list)
+    for t in tasks:
+        by_domain[t["domain"] or "unknown"].append(t)

-    # List ALL compute instances (no prefix filter)
-    print("Finding all compute instances...")
-    computes = client.list_compute_instances()  # No prefix = get all
+    print("\nBy domain:")
+    for domain in sorted(by_domain.keys()):
+        domain_tasks = by_domain[domain]
+        domain_success = sum(1 for t in domain_tasks if t["reward"] and t["reward"] > 0)
+        print(
+            f"  {domain}: {domain_success}/{len(domain_tasks)} ({100 * domain_success / len(domain_tasks):.1f}%)"
+        )
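+    # Illustrative output (hypothetical numbers):
+    #   Total tasks attempted: 5
+    #   Successful: 3 (60.0%)
+    #
+    #   By domain:
+    #     chrome: 2/3 (66.7%)
+    #     notepad: 1/2 (50.0%)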

-    if not computes:
-        print("  No compute instances found")
-    else:
-        print(f"  Found {len(computes)} compute instance(s):")
-        for name in computes:
-            try:
-                status = client.get_compute_status(name)
-            except Exception:
-                status = "unknown"
-            print(f"    - {name} ({status})")
+    # Errors
+    errors = [t for t in tasks if t.get("error")]
+    if errors:
+        print(f"\nErrors ({len(errors)}):")
+        for t in errors[:5]:  # Show first 5
+            print(f"  {t['id']}: {t['error'][:50]}")
+        if len(errors) > 5:
+            print(f"  ... and {len(errors) - 5} more")

-    print()
-    for name in computes:
-        if not args.yes:
-            confirm = input(f"  Delete '{name}'? [y/N]: ").strip().lower()
-            if confirm != "y":
-                print(f"  Skipped {name}")
-                continue
-        print(f"  Deleting {name}...", end="", flush=True)
-        try:
-            client.delete_compute_instance(name)
-            print(" done")
-        except Exception as e:
-            print(f" FAILED: {e}")
-
-    print("\nCleanup complete.")
-    print("Note: Resource deletion may take a few minutes to free quota.")
-    print()
+    return 0


-def cmd_cleanup_vms(args: argparse.Namespace) -> None:
-    """Clean up Azure compute instances to free quota."""
-    import subprocess
+def cmd_tasks(args):
+    """List available WAA benchmark tasks."""
+    ip = get_vm_ip()
+    if not ip:
+        print("ERROR: VM not found")
+        return 1

-    print("\n=== Cleaning up Azure Compute Instances ===\n")
+    print("Fetching available tasks from WAA container...")
+    print("-" * 60)

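+    # Task layout inside the container (as assumed by this command):
+    #   /client/evaluation_examples_windows/examples/<domain>/<task_id>.json
+    # Each domain directory holds one JSON task spec per task.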
-    # List current VMs
+    # Get list of domains (subdirectories in examples/)
     result = subprocess.run(
         [
-            "az",
-            "ml",
-            "compute",
-            "list",
-            "--resource-group",
-            args.resource_group,
-            "--workspace-name",
-            args.workspace,
-            "--query",
-            "[].name",
-            "-o",
-            "tsv",
+            "ssh",
+            *SSH_OPTS,
+            f"azureuser@{ip}",
+            "docker exec winarena ls /client/evaluation_examples_windows/examples/",
         ],
         capture_output=True,
         text=True,
     )

     if result.returncode != 0:
-        print(f"Error listing VMs: {result.stderr}")
-        sys.exit(1)
-
-    vms = [v.strip() for v in result.stdout.strip().split("\n") if v.strip()]
-
-    if not vms:
-        print("No compute instances found.")
-        return
+        print("ERROR: Could not fetch domain list")
+        return 1

-    print(f"Found {len(vms)} compute instance(s):")
-    for vm in vms:
-        print(f"  - {vm}")
-    print()
+    domains = result.stdout.strip().split("\n")

-    if not args.yes:
-        response = input(f"Delete all {len(vms)} VM(s)? [y/N] ")
-        if response.lower() != "y":
-            print("Aborted.")
-            return
+    # Count tasks per domain
+    domain_tasks = {}
+    total_tasks = 0

-    for vm in vms:
-        print(f"Deleting {vm}...", end=" ", flush=True)
-        del_result = subprocess.run(
+    for domain in domains:
+        if not domain:
+            continue
+        count_result = subprocess.run(
             [
-                "az",
-                "ml",
-                "compute",
-                "delete",
-                "--name",
-                vm,
-                "--resource-group",
-                args.resource_group,
-                "--workspace-name",
-                args.workspace,
-                "--yes",
+                "ssh",
+                *SSH_OPTS,
+                f"azureuser@{ip}",
+                f"docker exec winarena ls /client/evaluation_examples_windows/examples/{domain}/ 2>/dev/null | wc -l",
             ],
             capture_output=True,
             text=True,
         )
-        if del_result.returncode == 0:
-            print("done")
-        else:
-            print(f"failed: {del_result.stderr[:100]}")
+        count = (
+            int(count_result.stdout.strip())
+            if count_result.stdout.strip().isdigit()
+            else 0
+        )
+        domain_tasks[domain] = count
+        total_tasks += count
+
+    # Print summary
+    print(f"Total tasks: {total_tasks}")
+    print(f"Domains: {len(domains)}")
+    print()
+
+    # Print by domain
+    for domain in sorted(domain_tasks.keys()):
+        count = domain_tasks[domain]
+        print(f"  {domain}: {count} tasks")
+
+        if args.verbose and count > 0:
+            # List actual task IDs
+            tasks_result = subprocess.run(
+                [
+                    "ssh",
+                    *SSH_OPTS,
+                    f"azureuser@{ip}",
+                    f"docker exec winarena ls /client/evaluation_examples_windows/examples/{domain}/",
+                ],
+                capture_output=True,
+                text=True,
+            )
+            for task_file in tasks_result.stdout.strip().split("\n")[:5]:  # Limit to 5
+                task_id = task_file.replace(".json", "")
+                print(f"    - {task_id}")
+            if count > 5:
+                print(f"    ... and {count - 5} more")

-    print("\nCleanup complete. Quota should be freed within a few minutes.")
+    print()
+    print("Usage examples:")
+    print("  Run all notepad tasks:  cli run --domain notepad")
+    print("  Run all chrome tasks:   cli run --domain chrome")
+    print(
+        "  Run specific task:      cli run --task 366de66e-cbae-4d72-b042-26390db2b145-WOS"
+    )

+    return 0


-def cmd_list_jobs(args: argparse.Namespace) -> None:
-    """List recent Azure ML jobs."""
-    import subprocess

-    print("\n=== Recent Azure ML Jobs ===\n")
+def cmd_deallocate(args):
+    """Stop VM (preserves disk, stops billing)."""
+    init_logging()

+    log("DEALLOCATE", f"Deallocating VM '{VM_NAME}'...")
     result = subprocess.run(
-        [
-            "az",
-            "ml",
-            "job",
-            "list",
-            "--resource-group",
-            args.resource_group,
-            "--workspace-name",
-            args.workspace,
-            "-o",
-            "table",
-        ],
+        ["az", "vm", "deallocate", "-g", RESOURCE_GROUP, "-n", VM_NAME],
         capture_output=True,
         text=True,
     )
-    if result.returncode != 0:
-        print(f"Error: {result.stderr}")
-        sys.exit(1)
+    if result.returncode == 0:
+        log("DEALLOCATE", "VM deallocated (billing stopped)")
+        log("DEALLOCATE", "Use 'az vm start' to resume")
+        return 0
+    else:
+        log("DEALLOCATE", f"ERROR: {result.stderr}")
+        return 1

-    # Filter out experimental warnings
-    lines = [
-        line for line in result.stdout.split("\n") if "experimental" not in line.lower()
-    ]
-    print("\n".join(lines[: args.limit + 3]))  # +3 for header rows

+def cmd_exec(args):
+    """Run command on VM host."""
+    ip = get_vm_ip()
+    if not ip:
+        print("ERROR: VM not found or not running")
+        return 1

-def cmd_job_logs(args: argparse.Namespace) -> None:
-    """Download and display logs for an Azure ML job."""
-    import subprocess
-    import tempfile
+    cmd = args.cmd
+    if not cmd:
+        print("ERROR: --cmd is required")
+        return 1

-    print(f"\n=== Fetching logs for job: {args.job_name} ===\n")
+    result = ssh_run(ip, cmd, stream=True)
+    return result.returncode

-    with 
tempfile.TemporaryDirectory() as tmpdir: - result = subprocess.run( - [ - "az", - "ml", - "job", - "download", - "--name", - args.job_name, - "--resource-group", - args.resource_group, - "--workspace-name", - args.workspace, - "--download-path", - tmpdir, - "--all", - ], - capture_output=True, - text=True, - ) - if result.returncode != 0: - print(f"Error: {result.stderr}") - sys.exit(1) +def cmd_docker_exec(args): + """Run command inside winarena container.""" + ip = get_vm_ip() + if not ip: + print("ERROR: VM not found or not running") + return 1 - # Find and display logs - log_files = [ - f"{tmpdir}/artifacts/user_logs/std_log.txt", - f"{tmpdir}/artifacts/system_logs/lifecycler/execution-wrapper.log", - ] + cmd = args.cmd + if not cmd: + print("ERROR: --cmd is required") + return 1 - for log_file in log_files: - if Path(log_file).exists(): - print(f"=== {Path(log_file).name} ===") - with open(log_file) as f: - content = f.read() - if content.strip(): - print(content[:5000]) # Limit output - if len(content) > 5000: - print(f"\n... (truncated, full log at {log_file})") - else: - print("(empty)") - print() + docker_cmd = f"docker exec winarena {cmd}" + result = ssh_run(ip, docker_cmd, stream=True) + return result.returncode -def get_vm_ip(resource_group: str, vm_name: str) -> str | None: - """Get the public IP address of an Azure VM. +def cmd_vnc(args): + """Open VNC to view Windows desktop via SSH tunnel.""" + ip = get_vm_ip() + if not ip: + print("ERROR: VM not found or not running") + return 1 - Args: - resource_group: Azure resource group name - vm_name: Name of the VM + print(f"Setting up SSH tunnel to VM ({ip})...") + print("VNC will be available at: http://localhost:8006") + print("-" * 60) - Returns: - Public IP address or None if VM not found/running - """ - import subprocess + # Kill any existing tunnel on port 8006 + subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True) - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, + # Start SSH tunnel in background + tunnel_proc = subprocess.Popen( + ["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() - return None + # Give tunnel a moment to establish + time.sleep(2) -def cleanup_waa_resources(resource_group: str, vm_name: str) -> None: - """Clean up leftover Azure resources from a VM. + # Check if tunnel is running + if tunnel_proc.poll() is not None: + print("ERROR: SSH tunnel failed to start") + return 1 - When VM deletion fails or is incomplete, resources like VNETs, NICs, NSGs, - PublicIPs, and OS disks may be left behind, blocking new VM creation. + print(f"SSH tunnel established (PID: {tunnel_proc.pid})") - This function deletes all resources with names starting with the VM name - in the correct order: - 1. NICs first (depend on VNET, NSG, PublicIP) - 2. VNETs, NSGs, PublicIPs (can be deleted in parallel) - 3. 
OS disks last + # Open browser + import webbrowser - Args: - resource_group: Azure resource group name - vm_name: Base name of the VM (e.g., "waa-eval-vm") - """ - import subprocess + vnc_url = "http://localhost:8006" + print(f"Opening {vnc_url} in browser...") + webbrowser.open(vnc_url) - print(f" Cleaning up leftover resources for {vm_name}...") + print() + print("VNC is now accessible at: http://localhost:8006") + print("Press Ctrl+C to close the tunnel") + print("-" * 60) - # List all resources in the resource group that match the VM name prefix - result = subprocess.run( - [ - "az", "resource", "list", - "-g", resource_group, - "--query", f"[?starts_with(name, '{vm_name}')].[name, type]", - "-o", "tsv", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f" Warning: Could not list resources: {result.stderr[:100]}") - return - - if not result.stdout.strip(): - print(" No leftover resources found") - return - - # Parse resources and categorize by type - resources = [] - for line in result.stdout.strip().split("\n"): - if "\t" in line: - name, res_type = line.split("\t", 1) - resources.append((name.strip(), res_type.strip())) - - if not resources: - print(" No leftover resources found") - return - - print(f" Found {len(resources)} resource(s) to clean up:") - for name, res_type in resources: - short_type = res_type.split("/")[-1] if "/" in res_type else res_type - print(f" - {name} ({short_type})") - - # Delete in correct order: NICs first, then VNET/NSG/PublicIP, then disks - # Order matters because NICs depend on other resources - type_order = [ - "Microsoft.Network/networkInterfaces", - "Microsoft.Network/virtualNetworks", - "Microsoft.Network/networkSecurityGroups", - "Microsoft.Network/publicIPAddresses", - "Microsoft.Compute/disks", - ] - - for target_type in type_order: - for name, res_type in resources: - if res_type == target_type: - short_type = res_type.split("/")[-1] - print(f" Deleting {name} ({short_type})...", end="", flush=True) - del_result = subprocess.run( - [ - "az", "resource", "delete", - "-g", resource_group, - "-n", name, - "--resource-type", res_type, - ], - capture_output=True, - text=True, - timeout=120, - ) - if del_result.returncode == 0: - print(" done") - else: - print(f" failed: {del_result.stderr[:50]}") - - # Handle any remaining resource types not in our order list - known_types = set(type_order) - for name, res_type in resources: - if res_type not in known_types: - short_type = res_type.split("/")[-1] if "/" in res_type else res_type - print(f" Deleting {name} ({short_type})...", end="", flush=True) - del_result = subprocess.run( - [ - "az", "resource", "delete", - "-g", resource_group, - "-n", name, - "--resource-type", res_type, - ], - capture_output=True, - text=True, - timeout=120, - ) - if del_result.returncode == 0: - print(" done") - else: - print(f" failed: {del_result.stderr[:50]}") - - print(" Resource cleanup complete") - - -def ensure_docker_running(ip: str) -> bool: - """Ensure Docker daemon is running on the VM. - - After VM restart, Docker may not auto-start. This function checks - and starts Docker if needed. 
- - Args: - ip: Public IP address of the Azure VM - - Returns: - True if Docker is running, False on failure - """ - import subprocess - - # Check if Docker is running - check_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - "-o", - "ConnectTimeout=10", - f"azureuser@{ip}", - "docker info 2>/dev/null | head -1", - ], - capture_output=True, - text=True, - timeout=30, - ) - - if "Client:" in check_result.stdout: - return True # Docker is already running - - # Docker not running, try to start it - print(" Docker not running, starting...") - start_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "sudo systemctl start docker && sleep 3 && docker info 2>/dev/null | head -1", - ], - capture_output=True, - text=True, - timeout=60, - ) - - if "Client:" in start_result.stdout: - print(" ✓ Docker started") - # Wait for Docker to be fully ready - import time - - time.sleep(5) - return True - - print(f" ✗ Failed to start Docker: {start_result.stderr[:200]}") - return False - - -def capture_vm_screenshot(ip: str, output_path: Path | str = None) -> Path | None: - """Capture a screenshot from the Windows VM via QEMU monitor. - - Args: - ip: Public IP address of the Azure VM - output_path: Path to save the screenshot. Defaults to training_output/current/vm_screenshot.png - - Returns: - Path to the saved screenshot, or None on failure - """ - import subprocess - import shlex - - if output_path is None: - output_path = Path("training_output/current/vm_screenshot.png") - output_path = Path(output_path) - - try: - # Take screenshot via QEMU QMP monitor (port 7200) and convert to PNG on VM - # dockurr/windows uses QMP protocol on port 7200 - # Use Python PIL to convert PPM to PNG (ImageMagick not installed in container) - screenshot_script = """ -printf '%s\\n' '{"execute": "qmp_capabilities"}' '{"execute": "screendump", "arguments": {"filename": "/tmp/screen.ppm"}}' | nc -q1 localhost 7200 > /dev/null 2>&1 -sleep 1 -python3 -c "from PIL import Image; Image.open('/tmp/screen.ppm').save('/tmp/screen.png')" 2>/dev/null && cat /tmp/screen.png | base64 -""" - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - "-o", - "ConnectTimeout=10", - f"azureuser@{ip}", - f"docker exec winarena bash -c {shlex.quote(screenshot_script)}", - ], - capture_output=True, - text=True, - timeout=60, - ) - - if result.returncode == 0 and result.stdout.strip(): - # Decode base64 and save - import base64 - - png_data = base64.b64decode(result.stdout.strip()) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_bytes(png_data) - return output_path - else: - logger.warning( - f"Screenshot capture failed: {result.stderr[:200] if result.stderr else 'No output'}" - ) - return None - except subprocess.TimeoutExpired: - logger.warning("Screenshot capture timed out") - return None - except Exception as e: - logger.warning(f"Screenshot capture error: {e}") - return None - - -def check_waa_probe( - ip: str, timeout: int = 5, internal_ip: str = "172.30.0.2" -) -> tuple[bool, str | None]: - """Check if the WAA /probe endpoint is responding. - - Args: - ip: Public IP address of the Azure VM - timeout: Connection timeout in seconds - internal_ip: Internal IP of the Windows VM inside QEMU. 
- 172.30.0.2 for dockurr/windows:latest - 20.20.20.21 for official windowsarena/winarena - - Returns: - Tuple of (is_ready, response_text) - """ - import subprocess - - try: - # Run curl from inside the Docker container, not the VM host - # Port 5000 is only accessible within Docker's network - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - "-o", - "ConnectTimeout=5", - f"azureuser@{ip}", - f"docker exec winarena curl -s --connect-timeout {timeout} http://{internal_ip}:5000/probe 2>/dev/null", - ], - capture_output=True, - text=True, - timeout=30, - ) - response = result.stdout.strip() if result.stdout else None - return bool(response), response - except subprocess.TimeoutExpired: - return False, None - except Exception: - return False, None - - -def poll_waa_probe( - ip: str, max_attempts: int = 30, interval: int = 20, internal_ip: str = "172.30.0.2" -) -> bool: - """Poll the WAA /probe endpoint until it responds or timeout. - - Args: - ip: Public IP address of the Azure VM - max_attempts: Maximum number of polling attempts - interval: Seconds between attempts - internal_ip: Internal IP of the Windows VM inside QEMU - - Returns: - True if probe responded, False if timeout - """ - import time - - print( - f" Polling /probe endpoint at {internal_ip}:5000 (max {max_attempts * interval}s)..." - ) - print(" Monitor Windows at: http://localhost:8006 (VNC via SSH tunnel)") - print() - - for attempt in range(1, max_attempts + 1): - is_ready, response = check_waa_probe(ip, timeout=5, internal_ip=internal_ip) - if is_ready: - print(f"\n ✓ WAA server is READY after {attempt * interval}s") - print(f" Response: {response[:100] if response else '(empty)'}") - return True - print(f" [{attempt}/{max_attempts}] Not ready yet... waiting {interval}s") - time.sleep(interval) - - print(f"\n ✗ Timeout after {max_attempts * interval}s") - return False - - -def cmd_analyze(args: argparse.Namespace) -> None: - """Analyze WAA benchmark results and generate summary statistics. - - Can analyze results from: - 1. Local directory (--results-dir) - 2. Remote Azure VM via SSH (--vm-ip --remote) - faster, no download - 3. Remote Azure VM with download (--vm-ip) - downloads files first - - Outputs per-domain success rates and overall metrics. 
- """ - import subprocess - import tempfile - from datetime import datetime - - results_dir = args.results_dir - vm_ip = args.vm_ip - remote = getattr(args, "remote", False) - verbose = getattr(args, "verbose", False) - - # If --remote flag, run analysis via SSH on the VM - if vm_ip and remote: - print(f"Analyzing results on VM at {vm_ip} via SSH...") - remote_path = "/mnt/WindowsAgentArena/src/win-arena-container/client/results/pyautogui/a11y_tree" - - # Build SSH command to analyze results on VM - analysis_script = """ -import os -import json -from pathlib import Path - -results_path = Path("{remote_path}") -model_dirs = list(results_path.glob("*/0")) - -total_tasks = 0 -total_success = 0 -total_fail = 0 -total_incomplete = 0 -domain_stats = {{}} -successful_tasks = [] -failed_tasks = [] - -for model_dir in model_dirs: - model_name = model_dir.parent.name - for domain_dir in sorted(model_dir.iterdir()): - if not domain_dir.is_dir(): - continue - domain = domain_dir.name - tasks = [t for t in domain_dir.iterdir() if t.is_dir()] - success = fail = incomplete = 0 - for task_dir in tasks: - result_file = task_dir / "result.txt" - if result_file.exists(): - result = result_file.read_text().strip() - if result == "1.0": - success += 1 - successful_tasks.append(f"{{domain}}/{{task_dir.name}}") - else: - fail += 1 - failed_tasks.append(f"{{domain}}/{{task_dir.name}}") - else: - incomplete += 1 - total_tasks += len(tasks) - total_success += success - total_fail += fail - total_incomplete += incomplete - domain_stats[domain] = {{"total": len(tasks), "success": success, "fail": fail, "incomplete": incomplete}} - -result = {{ - "model": model_name if model_dirs else "unknown", - "total_tasks": total_tasks, - "evaluated": total_success + total_fail, - "success": total_success, - "fail": total_fail, - "incomplete": total_incomplete, - "success_rate": total_success / (total_success + total_fail) * 100 if (total_success + total_fail) > 0 else 0, - "domains": domain_stats, - "successful_tasks": successful_tasks, - "failed_tasks": failed_tasks -}} -print(json.dumps(result)) -""".format(remote_path=remote_path) - - try: - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{vm_ip}", - f"python3 -c '{analysis_script}'", - ], - capture_output=True, - text=True, - timeout=30, - ) - - if result.returncode != 0: - print(f"SSH analysis failed: {result.stderr}") - return - - data = json.loads(result.stdout) - except subprocess.TimeoutExpired: - print("SSH timeout") - return - except json.JSONDecodeError as e: - print(f"Failed to parse results: {e}") - print(f"Output: {result.stdout[:500]}") - return - - # Display results - print("\n" + "=" * 60) - print("WAA BENCHMARK RESULTS ANALYSIS") - print("=" * 60) - print(f"\nModel: {data['model']}") - print("-" * 40) - - for domain, stats in sorted(data["domains"].items()): - status = "✓" if stats["success"] > 0 else "○" - rate = f"{stats['success']}/{stats['total']}" - print( - f" {status} {domain:20s} {rate:8s} ({stats['fail']} fail, {stats['incomplete']} incomplete)" - ) - - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - print(f"Total tasks: {data['total_tasks']}") - print(f"Evaluated: {data['evaluated']}") - print(f"Incomplete: {data['incomplete']}") - print(f"Successful: {data['success']}") - print(f"Failed: {data['fail']}") - print(f"Success rate: {data['success_rate']:.1f}% (of evaluated)") - if data["total_tasks"] > 0: - print( - f"Completion rate: {data['evaluated'] / data['total_tasks'] * 100:.1f}%" - ) - - if verbose: - print("\n" + 
"-" * 40) - print("SUCCESSFUL TASKS:") - for task in data["successful_tasks"]: - print(f" ✓ {task}") - print("\nFAILED TASKS:") - for task in data["failed_tasks"]: - print(f" ✗ {task}") - - if args.output: - data["date"] = datetime.now().isoformat() - Path(args.output).write_text(json.dumps(data, indent=2)) - print(f"\nSummary saved to: {args.output}") - - return - - # If VM IP provided without --remote, fetch results from remote - if vm_ip and not results_dir: - print(f"Fetching results from VM at {vm_ip}...") - print("(Use --remote for faster analysis without downloading)") - remote_path = "/mnt/WindowsAgentArena/src/win-arena-container/client/results" - - # Create temp directory for results - results_dir = tempfile.mkdtemp(prefix="waa_results_") - print(f"Downloading to {results_dir}...") - - try: - subprocess.run( - [ - "scp", - "-r", - *SSH_OPTS, - f"azureuser@{vm_ip}:{remote_path}/pyautogui", - results_dir, - ], - check=True, - capture_output=True, - ) - results_dir = Path(results_dir) / "pyautogui" - except subprocess.CalledProcessError as e: - print(f"Failed to fetch results: {e}") - return - - if not results_dir: - print("Error: Provide --results-dir or --vm-ip") - return - - results_path = Path(results_dir) - if not results_path.exists(): - print(f"Error: Results directory not found: {results_path}") - return - - # Find the model results directory - # Structure: pyautogui/a11y_tree/{model}/0/{domain}/{task_id}/ - model_dirs = list(results_path.glob("a11y_tree/*/0")) - if not model_dirs: - # Try direct path - model_dirs = list(results_path.glob("*/0")) - if not model_dirs: - print(f"No model results found in {results_path}") - return - - print("\n" + "=" * 60) - print("WAA BENCHMARK RESULTS ANALYSIS") - print("=" * 60) - - total_tasks = 0 - total_success = 0 - total_fail = 0 - total_incomplete = 0 - domain_stats = {} - successful_tasks = [] - failed_tasks = [] - - for model_dir in model_dirs: - model_name = model_dir.parent.name - print(f"\nModel: {model_name}") - print("-" * 40) - - # Iterate through domains - for domain_dir in sorted(model_dir.iterdir()): - if not domain_dir.is_dir(): - continue - - domain = domain_dir.name - tasks = list(domain_dir.iterdir()) - task_count = len([t for t in tasks if t.is_dir()]) - - success = 0 - fail = 0 - incomplete = 0 - - for task_dir in tasks: - if not task_dir.is_dir(): - continue - - result_file = task_dir / "result.txt" - if result_file.exists(): - result = result_file.read_text().strip() - if result == "1.0": - success += 1 - successful_tasks.append(f"{domain}/{task_dir.name}") - else: - fail += 1 - failed_tasks.append(f"{domain}/{task_dir.name}") - else: - incomplete += 1 - - total_tasks += task_count - total_success += success - total_fail += fail - total_incomplete += incomplete - - domain_stats[domain] = { - "total": task_count, - "success": success, - "fail": fail, - "incomplete": incomplete, - } - - # Format output - status = "✓" if success > 0 else "○" - rate = f"{success}/{task_count}" if task_count > 0 else "0/0" - print( - f" {status} {domain:20s} {rate:8s} ({fail} fail, {incomplete} incomplete)" - ) - - # Summary - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - evaluated = total_success + total_fail - print(f"Total tasks: {total_tasks}") - print(f"Evaluated: {evaluated}") - print(f"Incomplete: {total_incomplete}") - print(f"Successful: {total_success}") - print(f"Failed: {total_fail}") - if evaluated > 0: - print( - f"Success rate: {total_success / evaluated * 100:.1f}% (of evaluated)" - ) - if total_tasks > 0: 
- print(f"Completion rate: {evaluated / total_tasks * 100:.1f}%") - - if verbose: - print("\n" + "-" * 40) - print("SUCCESSFUL TASKS:") - for task in successful_tasks: - print(f" ✓ {task}") - print("\nFAILED TASKS:") - for task in failed_tasks: - print(f" ✗ {task}") - - # Save summary JSON if requested - if args.output: - summary = { - "date": datetime.now().isoformat(), - "model": model_name if model_dirs else "unknown", - "total_tasks": total_tasks, - "evaluated": evaluated, - "success": total_success, - "fail": total_fail, - "incomplete": total_incomplete, - "success_rate": total_success / evaluated * 100 if evaluated > 0 else 0, - "domains": domain_stats, - "successful_tasks": successful_tasks, - "failed_tasks": failed_tasks, - } - output_path = Path(args.output) - output_path.write_text(json.dumps(summary, indent=2)) - print(f"\nSummary saved to: {output_path}") - - -def launch_benchmark_viewer( - vm_ip: str, - port: int = 8765, - open_browser: bool = True, - internal_ip: str = "172.30.0.2", -) -> None: - """Launch the benchmark viewer for monitoring a running WAA benchmark. - - This starts the local dashboard server with VM IP configuration and - optionally opens the browser to the benchmark viewer page. - - Args: - vm_ip: IP address of the Azure VM running WAA - port: Port for local dashboard server (default: 8765) - open_browser: Whether to open browser automatically - internal_ip: Internal IP of Windows VM inside Docker - """ - import subprocess - import os - import sys - - print("\n=== Launching Benchmark Viewer ===\n") - print(f" VM IP: {vm_ip}") - print(f" Internal IP: {internal_ip}") - print(f" Local port: {port}") - print(f" Dashboard: http://localhost:{port}/benchmark.html") - print(" VNC available via button in viewer when VM is ready") - print() - - # Set environment variables for the server to use - os.environ["WAA_VM_IP"] = vm_ip - os.environ["WAA_INTERNAL_IP"] = internal_ip - - # Build the serve command - use --benchmark to generate benchmark.html - serve_cmd = [ - sys.executable, - "-m", - "openadapt_ml.cloud.local", - "serve", - "--port", - str(port), - "--quiet", - "--benchmark", - "benchmark_results", # This triggers benchmark.html generation - ] - if open_browser: - serve_cmd.append("--open") - - print(" Press Ctrl+C to stop\n") - - try: - # Run the server - subprocess.run(serve_cmd) - except KeyboardInterrupt: - print("\nViewer stopped") - - -def cmd_viewer(args: argparse.Namespace) -> None: - """Launch benchmark viewer for monitoring a running VM. - - Usage: - uv run python -m openadapt_ml.benchmarks.cli viewer --vm-ip 172.171.112.41 - - This starts the local server configured to poll the specified VM - for benchmark status and opens the browser. - """ - print("\n=== Deprecated Viewer ===\n") - print("benchmark.html is legacy and will be deprecated.") - print("Use `vm monitor` (azure_ops.html) for live VM status + VNC panel.") - - vm_ip = args.vm_ip - port = getattr(args, "port", 8765) - no_open = getattr(args, "no_open", False) - internal_ip = getattr(args, "internal_ip", "172.30.0.2") - - launch_benchmark_viewer( - vm_ip=vm_ip, port=port, open_browser=not no_open, internal_ip=internal_ip - ) - - -def cmd_vm(args: argparse.Namespace) -> None: - """Manage dedicated WAA eval VM with nested virtualization support. - - This creates a standalone Azure VM (not Azure ML compute) that supports - nested virtualization, which is required for running WAA's Windows VM - inside Docker/QEMU. 
- """ - import subprocess - - vm_name = args.name - resource_group = args.resource_group - vm_size = args.size - location = args.location - - if args.action == "list-sizes": - print( - f"\n=== Available VM Sizes with Nested Virtualization in {location} ===\n" - ) - print("Checking available D-series sizes (support nested virt)...") - - # Get available sizes - result = subprocess.run( - [ - "az", - "vm", - "list-skus", - "--location", - location, - "--size", - "Standard_D", - "--all", - "--output", - "table", - "--query", - "[?restrictions[?reasonCode=='NotAvailableForSubscription']==`[]`].{Name:name, vCPUs:capabilities[?name=='vCPUs'].value|[0], Memory:capabilities[?name=='MemoryGB'].value|[0]}", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"Error: {result.stderr}") - sys.exit(1) - - print(result.stdout) - print("\nRecommended sizes for WAA (support nested virt):") - print(" - Standard_D4s_v3 (4 vCPU, 16GB, 32GB temp) ~$0.19/hr") - print(" - Standard_D8s_v3 (8 vCPU, 32GB, 64GB temp) ~$0.38/hr") - print(" - Standard_D4ds_v5 (4 vCPU, 16GB, 150GB temp) ~$0.19/hr") - print(" - Standard_D8ds_v5 (8 vCPU, 32GB, 300GB temp) ~$0.38/hr [RECOMMENDED]") - print( - "\nTry different locations if sizes are unavailable: westus2, centralus, westeurope" - ) - return - - elif args.action == "create": - print(f"\n=== Creating WAA Eval VM: {vm_name} ===\n") - print(f" Resource Group: {resource_group}") - print(f" Location: {location}") - print(f" VM Size: {vm_size} (supports nested virtualization)") - print(" OS: Ubuntu 22.04 LTS") - print() - - # Check if VM already exists - check = subprocess.run( - ["az", "vm", "show", "-g", resource_group, "-n", vm_name, "-o", "json"], - capture_output=True, - text=True, - ) - if check.returncode == 0: - print( - f"✗ VM '{vm_name}' already exists. Use 'vm status' to check it or 'vm delete' first." - ) - sys.exit(1) - - print("Creating VM (this takes 2-3 minutes)...") - result = subprocess.run( - [ - "az", - "vm", - "create", - "--resource-group", - resource_group, - "--name", - vm_name, - "--location", - location, - "--image", - "Ubuntu2204", - "--size", - vm_size, - "--admin-username", - "azureuser", - "--generate-ssh-keys", - "--public-ip-sku", - "Standard", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"✗ Error creating VM: {result.stderr}") - sys.exit(1) - - # Parse output to get IP - import json - - vm_info = json.loads(result.stdout) - public_ip = vm_info.get("publicIpAddress", "unknown") - - print("\n✓ VM created successfully!") - print(f"\n Public IP: {public_ip}") - print(f" SSH command: ssh azureuser@{public_ip}") - print("\n Next steps:") - print( - " 1. SSH into the VM: uv run python -m openadapt_ml.benchmarks.cli vm ssh" - ) - print(" 2. Verify nested virt: egrep -c '(vmx|svm)' /proc/cpuinfo") - print(" 3. 
Install Docker and run WAA") - - elif args.action == "status": - print(f"\n=== WAA Eval VM Status: {vm_name} ===\n") - - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "{name:name,powerState:powerState,publicIps:publicIps,size:hardwareProfile.vmSize}", - "-o", - "json", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"✗ VM '{vm_name}' not found in resource group '{resource_group}'") - print( - " Create it with: uv run python -m openadapt_ml.benchmarks.cli vm create" - ) - sys.exit(1) - - import json - - info = json.loads(result.stdout) - print(f" Name: {info.get('name')}") - print(f" State: {info.get('powerState')}") - print(f" Size: {info.get('size')}") - print(f" Public IP: {info.get('publicIps')}") - - if info.get("publicIps"): - print(f"\n SSH command: ssh azureuser@{info.get('publicIps')}") - - elif args.action == "ssh": - # Get IP and SSH - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0 or not result.stdout.strip(): - print(f"✗ Could not get IP for VM '{vm_name}'. Is it running?") - sys.exit(1) - - ip = result.stdout.strip() - print(f"Connecting to {vm_name} at {ip}...") - import os - - # Use SSH_OPTS for consistent keepalive settings - os.execvp("ssh", ["ssh", *SSH_OPTS, f"azureuser@{ip}"]) - - elif args.action == "delete": - print(f"\n=== Deleting WAA Eval VM: {vm_name} ===\n") - - if not args.yes: - confirm = input(f"Are you sure you want to delete VM '{vm_name}'? (y/N): ") - if confirm.lower() != "y": - print("Cancelled.") - return - - print("Deleting VM and associated resources...") - result = subprocess.run( - ["az", "vm", "delete", "-g", resource_group, "-n", vm_name, "--yes"], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"✗ Error deleting VM: {result.stderr}") - sys.exit(1) - - print(f"✓ VM '{vm_name}' deleted") - - elif args.action == "deallocate": - print(f"\n=== Deallocating VM: {vm_name} (stops billing, preserves disk) ===\n") - - result = subprocess.run( - [ - "az", - "vm", - "deallocate", - "-g", - resource_group, - "-n", - vm_name, - "--no-wait", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"✗ Error deallocating VM: {result.stderr}") - sys.exit(1) - - print(f"✓ VM '{vm_name}' deallocation initiated") - print("\n Cost savings: Deallocated VMs do not incur compute charges.") - print(" Storage costs still apply. 
Use 'vm delete' to stop all charges.") - print(" To restart: python -m openadapt_ml.benchmarks.cli vm start") - - elif args.action == "start": - import time - - print(f"\n=== Starting VM: {vm_name} ===\n") - - result = subprocess.run( - ["az", "vm", "start", "-g", resource_group, "-n", vm_name, "--no-wait"], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"✗ Error starting VM: {result.stderr}") - sys.exit(1) - - print(f"✓ VM '{vm_name}' start initiated") - - # Wait for VM to be running and recover Docker (temp disk is wiped on deallocate) - print("\n Waiting for VM to be ready...") - ip = None - for _ in range(30): # Wait up to 60 seconds - time.sleep(2) - ip = get_vm_ip(resource_group, vm_name) - if ip: - # Check if SSH is accessible - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", "echo ready"], - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode == 0: - break - - if ip: - print(f" ✓ VM is running (IP: {ip})") - - # Recover Docker - /mnt is wiped when VM is deallocated - print(" Checking Docker...") - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", "sudo docker ps 2>&1"], - capture_output=True, - text=True, - ) - - if ( - "Cannot connect to the Docker daemon" in result.stdout - or result.returncode != 0 - ): - print(" Docker not running. Recovering...") - # Create Docker directories (symlinked to /mnt which gets wiped) - # Kill any stale processes, create dirs, start services - recovery_cmd = """ - sudo pkill -9 dockerd 2>/dev/null - sudo pkill -9 containerd 2>/dev/null - sudo rm -f /var/run/docker.pid /var/run/containerd/containerd.pid - sudo mkdir -p /mnt/docker /mnt/containerd - sleep 2 - sudo systemctl reset-failed - sudo systemctl start containerd - sleep 3 - sudo systemctl start docker - """ - subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", recovery_cmd], - capture_output=True, - text=True, - ) - time.sleep(5) - - # Verify Docker is running - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", "sudo docker ps"], - capture_output=True, - text=True, - ) - if result.returncode == 0: - print(" ✓ Docker recovered and running") - else: - print(" ⚠ Docker may not be running. Check with 'vm diag'") - else: - print(" ✓ Docker is running") - else: - print("\n Use 'vm status' to check when the VM is running.") - - print(" Use 'vm monitor' to start dashboard when VM is ready.") - - elif args.action == "setup": - print(f"\n=== Setting up WAA Eval VM: {vm_name} ===\n") - - # Get VM IP - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - if result.returncode != 0 or not result.stdout.strip(): - print( - f"✗ Could not get IP for VM '{vm_name}'. 
Create it first with 'vm create'" - ) - sys.exit(1) - - ip = result.stdout.strip() - print(f" VM IP: {ip}") - print("\n[1/3] Installing Docker...") - - # Install Docker - docker_cmd = ( - "sudo apt-get update -qq && " - "sudo apt-get install -y -qq docker.io && " - "sudo systemctl start docker && " - "sudo systemctl enable docker && " - "sudo usermod -aG docker $USER" - ) - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", docker_cmd], - capture_output=True, - text=True, - ) - if result.returncode != 0: - print(f"✗ Error installing Docker: {result.stderr}") - sys.exit(1) - print(" ✓ Docker installed") - - print("\n[2/3] Verifying nested virtualization...") - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", "egrep -c '(vmx|svm)' /proc/cpuinfo"], - capture_output=True, - text=True, - ) - cpu_count = result.stdout.strip() - if cpu_count and int(cpu_count) > 0: - print(f" ✓ Nested virt supported ({cpu_count} CPUs with vmx/svm)") - else: - print(" ⚠ Nested virt may not be supported") - - print("\n[3/3] Setup complete!") - print("\n Next: Pull WAA image with 'vm pull-image'") - print(" Or SSH in: uv run python -m openadapt_ml.benchmarks.cli vm ssh") - - elif args.action == "pull-image": - print(f"\n=== Pulling WAA Docker Image to VM: {vm_name} ===\n") - - acr_name = args.acr - acr_url = f"{acr_name}.azurecr.io" - image = f"{acr_url}/winarena:latest" - - # Get VM IP - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - if result.returncode != 0 or not result.stdout.strip(): - print(f"✗ Could not get IP for VM '{vm_name}'") - sys.exit(1) - - ip = result.stdout.strip() - print(f" VM IP: {ip}") - print(f" Image: {image}") - - print("\n[1/2] Getting ACR access token...") - result = subprocess.run( - [ - "az", - "acr", - "login", - "--name", - acr_name, - "--expose-token", - "--query", - "accessToken", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - if result.returncode != 0: - print(f"✗ Error getting ACR token: {result.stderr}") - sys.exit(1) - - token = result.stdout.strip() - print(" ✓ Got ACR token") - - print("\n[2/2] Logging into ACR and pulling image (this takes 5-10 minutes)...") - # Login to ACR on VM and pull - pull_cmd = f"sudo docker login {acr_url} -u 00000000-0000-0000-0000-000000000000 -p '{token}' && sudo docker pull {image}" - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", pull_cmd], - capture_output=False, # Show output live - ) - if result.returncode != 0: - print("\n✗ Error pulling image") - sys.exit(1) - - print("\n✓ WAA image pulled successfully!") - print(f"\n Image ready: {image}") - print(" Run WAA with: uv run python -m openadapt_ml.benchmarks.cli vm ssh") - - # NOTE: Deprecated actions removed (Jan 2026): - # - setup-waa: Replaced by top-level 'waa' command - # - prepare-windows: Replaced by top-level 'waa' command - # - run-waa: Replaced by top-level 'waa' command - # Use: uv run python -m openadapt_ml.benchmarks.cli waa --help - - # DEAD CODE REMOVED - more to clean up - - # Handle single worker (backward compatible) - if num_workers == 1: - # Get VM IP or create VM - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - - vm_created = False - if result.returncode == 0 and result.stdout.strip(): - ip = 
result.stdout.strip() - print(f"[✓] VM already exists: {ip}") - vm_created = True - else: - print("[1/6] Creating Azure VM with nested virtualization...") - # Try multiple locations if needed - locations_to_try = [location, "westus2", "centralus", "eastus2"] - for loc in locations_to_try: - result = subprocess.run( - [ - "az", - "vm", - "create", - "--resource-group", - resource_group, - "--name", - vm_name, - "--location", - loc, - "--image", - "Ubuntu2204", - "--size", - "Standard_D8ds_v5", # v5 series supports nested virt - "--admin-username", - "azureuser", - "--generate-ssh-keys", - "--public-ip-sku", - "Standard", - ], - capture_output=True, - text=True, - ) - if result.returncode == 0: - import json as json_mod - - vm_info = json_mod.loads(result.stdout) - ip = vm_info.get("publicIpAddress", "") - print(f" ✓ VM created in {loc}: {ip}") - vm_created = True - break - else: - print(f" • {loc}: unavailable, trying next...") - - if not vm_created: - print("✗ Could not create VM in any region") - sys.exit(1) - - print("\n[2/6] Installing Docker with /mnt storage (300GB)...") - docker_cmds = [ - "sudo apt-get update -qq", - "sudo apt-get install -y -qq docker.io", - "sudo systemctl start docker", - "sudo systemctl enable docker", - "sudo usermod -aG docker $USER", - # Configure Docker to use larger /mnt disk - "sudo systemctl stop docker", - "sudo mkdir -p /mnt/docker", - # Configure Docker to use /mnt and enable BuildKit with cache limits - # keepBytes: max 30GB cache, gcPolicy: auto-prune when over limit - 'echo \'{"data-root": "/mnt/docker", "features": {"buildkit": true}}\' | sudo tee /etc/docker/daemon.json', - # Configure BuildKit garbage collection (30GB max cache) - "sudo mkdir -p /etc/buildkit", - 'echo \'[worker.oci]\\n gc = true\\n gckeepstorage = 30000000000\\n[[worker.oci.gcpolicy]]\\n keepBytes = 30000000000\\n keepDuration = 172800\\n filters = ["type==source.local", "type==exec.cachemount", "type==source.git.checkout"]\' | sudo tee /etc/buildkit/buildkitd.toml', - "sudo systemctl start docker", - ] - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - "-o", - "ConnectTimeout=30", - f"azureuser@{ip}", - " && ".join(docker_cmds), - ], - capture_output=True, - text=True, - ) - if result.returncode != 0: - print( - f" ⚠ Docker setup warning: {result.stderr[:200] if result.stderr else 'unknown'}" - ) - else: - print(" ✓ Docker installed with /mnt storage") - - print("\n[3/6] Verifying nested virtualization...") - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "egrep -c '(vmx|svm)' /proc/cpuinfo", - ], - capture_output=True, - text=True, - ) - cpu_count = result.stdout.strip() - if cpu_count and int(cpu_count) > 0: - print(f" ✓ Nested virt supported ({cpu_count} CPUs with vmx/svm)") - else: - print(" ✗ Nested virtualization not supported - WAA won't work") - sys.exit(1) - - print("\n[4/6] Pulling dockurr/windows image (for Windows VM)...") - # Use dockurr/windows directly - the ACR winarena image has broken dockur - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "sudo docker pull dockurr/windows:latest 2>&1 | tail -5", - ], - capture_output=True, - text=True, - timeout=300, - ) - if result.returncode != 0: - print( - f" ⚠ Image pull warning: {result.stderr[:100] if result.stderr else ''}" - ) - print(" ✓ Windows image pulled") - - print("\n[5/6] Cloning WindowsAgentArena repository...") - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "cd ~ && git clone --depth 1 
https://github.com/microsoft/WindowsAgentArena.git 2>/dev/null || echo 'Already cloned'", - ], - capture_output=True, - text=True, - ) - print(" ✓ WAA repo cloned") - - print("\n[6/6] Creating WAA config file...") - api_key = args.api_key or settings.openai_api_key or "" - if not api_key: - print( - " ⚠ No API key provided. Set with --api-key, OPENAI_API_KEY env var, or in .env file" - ) - api_key = "placeholder-set-your-key" - - config_cmd = f'''cat > ~/WindowsAgentArena/config.json << 'EOF' -{{ - "OPENAI_API_KEY": "{api_key}", - "AZURE_API_KEY": "", - "AZURE_ENDPOINT": "" -}} -EOF''' - subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", config_cmd], - capture_output=True, - text=True, - ) - print(" ✓ Config created") - - print(f"\n{'=' * 60}") - print(" WAA Setup Complete!") - print(f"{'=' * 60}") - print(f"\n VM IP: {ip}") - print("\n Next step: Prepare Windows image (one-time, ~20 min):") - print(" uv run python -m openadapt_ml.benchmarks.cli waa --setup-only") - print("\n Or run WAA directly (will auto-prepare on first run):") - print( - " uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 5" - ) - - else: - # Multi-worker mode: create multiple VMs in parallel - print(f"[1/4] Creating {num_workers} Azure VMs in parallel...") - worker_names = [f"waa-worker-{i:02d}" for i in range(num_workers)] - - created_vms = [] - with ThreadPoolExecutor(max_workers=5) as executor: - futures = { - executor.submit(create_single_vm, name, location): name - for name in worker_names - } - for future in as_completed(futures): - name, result_loc = future.result() - if result_loc: - print(f" ✓ {name}: creation started in {result_loc}") - created_vms.append(name) - else: - print(f" ✗ {name}: FAILED") - - if not created_vms: - print("✗ Could not create any VMs") - sys.exit(1) - - # Wait for VMs to be ready and get IPs - print("\n[2/4] Waiting for VMs to get public IPs...") - import time as time_mod - - workers_with_ips = [] - for _ in range(30): # Wait up to 5 minutes - for name in created_vms: - if any(n == name for n, _ in workers_with_ips): - continue # Already got IP - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - if result.stdout.strip(): - workers_with_ips.append((name, result.stdout.strip())) - print(f" ✓ {name}: {result.stdout.strip()}") - else: - pass - - if len(workers_with_ips) == len(created_vms): - break - time_mod.sleep(10) - - if not workers_with_ips: - print("✗ No VMs got public IPs") - sys.exit(1) - - # Set up Docker and WAA on each VM in parallel - api_key = args.api_key or settings.openai_api_key or "" - if not api_key: - print( - " ⚠ No API key provided. Set with --api-key, OPENAI_API_KEY env var, or in .env file" - ) - api_key = "placeholder-set-your-key" - - print( - f"\n[3/4] Setting up Docker and WAA on {len(workers_with_ips)} VMs..." 
- ) - with ThreadPoolExecutor(max_workers=5) as executor: - futures = { - executor.submit(setup_single_vm, name, ip, api_key): name - for name, ip in workers_with_ips - } - for future in as_completed(futures): - name = futures[future] - success = future.result() - if success: - print(f" ✓ {name}: Docker + WAA configured") - else: - print(f" ✗ {name}: Setup failed") - - # Create pool registry - print("\n[4/4] Registering VM pool...") - registry = VMPoolRegistry() - pool = registry.create_pool( - workers=workers_with_ips, - resource_group=resource_group, - location=location, - vm_size="Standard_D8ds_v5", - ) - print( - f" ✓ Pool {pool.pool_id} registered with {len(pool.workers)} workers" - ) - - print(f"\n{'=' * 60}") - print(" Multi-Worker WAA Setup Complete!") - print(f"{'=' * 60}") - print(f"\n Workers: {len(workers_with_ips)}") - for name, ip in workers_with_ips: - print(f" - {name}: {ip}") - print("\n Next steps:") - print(" 1. Check pool status:") - print(" uv run python -m openadapt_ml.benchmarks.cli vm pool-status") - print(" 2. Prepare Windows on all workers (in parallel):") - print(" # Workers run waa command individually") - print(" 3. Run parallel benchmark:") - print( - " uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 30" - ) - - elif args.action == "fix-storage": - print("\n=== Fix WAA Storage (Move to /mnt for More Space) ===\n") - print( - "Moves WAA storage from root disk (~10GB free) to /mnt temp disk (~115GB free).\n" - ) - - # Get VM IP - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - if result.returncode != 0 or not result.stdout.strip(): - print(f"✗ VM '{vm_name}' not found. 
Run 'vm setup-waa' first.") - sys.exit(1) - ip = result.stdout.strip() - - print(f" VM IP: {ip}") - print() - - # Step 1: Check current storage - print("[1/4] Checking current storage situation...") - check_cmd = """ -df -h / /mnt 2>/dev/null | grep -E 'Filesystem|/dev' -echo '---' -docker inspect winarena --format='Storage: {{range .Mounts}}{{.Source}}{{end}}' 2>/dev/null || echo 'No container running' -""" - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", check_cmd], - capture_output=True, - text=True, - ) - print(result.stdout) - - # Step 2: Stop container - print("[2/4] Stopping WAA container...") - subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker stop winarena 2>/dev/null; docker rm winarena 2>/dev/null", - ], - capture_output=True, - text=True, - ) - print(" ✓ Container stopped") - - # Step 3: Move storage to /mnt - print("\n[3/4] Moving storage to /mnt (preserves Windows image)...") - move_cmd = """ -sudo mkdir -p /data/waa-storage -sudo chown azureuser:azureuser /data/waa-storage -# Move existing storage if any -if [ -d ~/waa-storage ]; then - mv ~/waa-storage/* /data/waa-storage/ 2>/dev/null - rm -rf ~/waa-storage - echo "Moved from ~/waa-storage" -fi -# Also check /home/azureuser/waa-storage explicitly -if [ -d /home/azureuser/waa-storage ]; then - mv /home/azureuser/waa-storage/* /data/waa-storage/ 2>/dev/null - rm -rf /home/azureuser/waa-storage - echo "Moved from /home/azureuser/waa-storage" -fi -ls -lh /data/waa-storage/ -""" - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", move_cmd], - capture_output=True, - text=True, - ) - print(result.stdout) - print(" ✓ Storage moved to /data/waa-storage") - - # Step 4: Restart container with new mount using vanilla WAA - print("\n[4/4] Restarting WAA container with /mnt storage...") - api_key = settings.openai_api_key or os.environ.get("OPENAI_API_KEY", "") - docker_cmd = f"""docker run -d \\ - --name winarena \\ - --device=/dev/kvm \\ - --cap-add NET_ADMIN \\ - --stop-timeout 120 \\ - -p 8006:8006 \\ - -p 3389:3389 \\ - -v /data/waa-storage:/storage \\ - -e VERSION=11e \\ - -e RAM_SIZE=12G \\ - -e CPU_CORES=4 \\ - -e OPENAI_API_KEY='{api_key}' \\ - --entrypoint /bin/bash \\ - windowsarena/winarena:latest \\ - -c './entry.sh --prepare-image false --start-client true --agent navi --model gpt-4o --som-origin oss --a11y-backend uia'""" - - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", docker_cmd], - capture_output=True, - text=True, - timeout=60, - ) - if result.returncode != 0: - print(f" ✗ Failed to start container: {result.stderr}") - sys.exit(1) - print(" ✓ WAA container restarted with /mnt storage") - - print(f"\n{'=' * 60}") - print(" Storage Fixed!") - print(f"{'=' * 60}") - print("\n Storage now on /mnt: ~115GB available") - print(" VNC: http://localhost:8006 (via SSH tunnel)") - print("\n If Windows was installing, it will resume automatically.") - print(" Monitor: uv run python -m openadapt_ml.benchmarks.cli vm status") - - elif args.action == "docker-prune": - print("\n=== Docker Cleanup (Free Disk Space) ===\n") - - # Get VM IP - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. 
Run 'vm setup-waa' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Check disk space before - print("[1/4] Current disk usage...") - df_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "df -h / /mnt 2>/dev/null | grep -E 'Filesystem|/dev'", - ], - capture_output=True, - text=True, - ) - print(f" {df_result.stdout}") - - # Docker system prune - print("[2/4] Cleaning Docker (images, containers, build cache)...") - prune_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker system prune -af --volumes 2>&1", - ], - capture_output=True, - text=True, - timeout=300, - ) - if prune_result.returncode == 0: - # Extract space reclaimed - output = prune_result.stdout - print(" ✓ Docker cleanup complete") - if "Total reclaimed space" in output: - for line in output.split("\n"): - if "Total reclaimed space" in line: - print(f" {line.strip()}") - else: - print(f" Warning: {prune_result.stderr[:200]}") - - # Deep cleanup: containerd snapshotter and buildkit cache - # These can accumulate even after docker prune - print("[3/4] Deep cleanup (containerd snapshotter, buildkit)...") - deep_clean_cmd = """ -# Stop services to release file locks -sudo systemctl stop docker.socket docker.service containerd.service 2>/dev/null -sleep 2 -# Kill any remaining containerd processes -sudo pkill -9 containerd 2>/dev/null || true -sleep 1 -# Clean containerd overlayfs snapshots (can be 30+ GB) -sudo rm -rf /mnt/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/* 2>/dev/null || true -sudo rm -rf /mnt/containerd/io.containerd.content.v1.content/blobs/* 2>/dev/null || true -# Clean buildkit cache -sudo rm -rf /mnt/docker/buildkit/containerd-overlayfs 2>/dev/null || true -# Restart Docker -sudo systemctl start docker 2>/dev/null -echo "deep_clean_done" -""" - deep_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - deep_clean_cmd, - ], - capture_output=True, - text=True, - timeout=120, - ) - if "deep_clean_done" in deep_result.stdout: - print(" ✓ Deep cleanup complete") - else: - print(f" Warning: Deep cleanup may have failed") - - # Check disk space after - print("[4/4] Disk usage after cleanup...") - df_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "df -h / /mnt 2>/dev/null | grep -E 'Filesystem|/dev'", - ], - capture_output=True, - text=True, - ) - print(f" {df_result.stdout}") - - # Configure BuildKit GC to prevent future cache bloat - print("[Bonus] Configuring BuildKit garbage collection (30GB limit)...") - buildkit_config = ( - "[worker.oci]\\n" - " gc = true\\n" - " gckeepstorage = 30000000000\\n" - "[[worker.oci.gcpolicy]]\\n" - " keepBytes = 30000000000\\n" - " keepDuration = 172800\\n" - ' filters = ["type==source.local", "type==exec.cachemount", "type==source.git.checkout"]' - ) - gc_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - f"sudo mkdir -p /etc/buildkit && echo -e '{buildkit_config}' | sudo tee /etc/buildkit/buildkitd.toml >/dev/null && echo 'configured'", - ], - capture_output=True, - text=True, - timeout=30, - ) - if gc_result.returncode == 0 and "configured" in gc_result.stdout: - print(" ✓ BuildKit GC configured (max 30GB cache)") - else: - print(" Warning: Could not configure BuildKit GC") - - print( - "\n Retry build: uv run python -m openadapt_ml.benchmarks.cli waa --rebuild" - ) - - elif args.action == "docker-move": - print("\n=== Move Docker Data to /mnt (300GB) ===\n") - print("Reconfigures Docker to use /mnt/docker for all images and 
layers.") - print("This solves 'no space left on device' errors during docker build.\n") - - # Get VM IP - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Step 1: Check current disk usage - print("[1/5] Current disk usage...") - df_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "df -h / /mnt 2>/dev/null | grep -E 'Filesystem|/dev'", - ], - capture_output=True, - text=True, - ) - print(f" {df_result.stdout}") - - # Step 2: Stop Docker and containerd - print("[2/5] Stopping Docker and containerd...") - stop_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "sudo systemctl stop docker docker.socket containerd 2>&1 && echo 'stopped'", - ], - capture_output=True, - text=True, - timeout=60, - ) - if "stopped" in stop_result.stdout: - print(" ✓ Docker and containerd stopped") - else: - print(f" Warning: {stop_result.stderr[:200]}") - - # Step 3: Create symlinks from /var/lib/docker and /var/lib/containerd to /mnt - print("[3/5] Creating symlinks to /mnt (most reliable method)...") - config_cmd = """ -# Create directories on /mnt -sudo mkdir -p /mnt/docker -sudo mkdir -p /mnt/containerd - -# Remove existing directories and create symlinks -sudo rm -rf /var/lib/docker 2>/dev/null -sudo rm -rf /var/lib/containerd 2>/dev/null -sudo ln -sf /mnt/docker /var/lib/docker -sudo ln -sf /mnt/containerd /var/lib/containerd - -# Verify symlinks -ls -la /var/lib/docker /var/lib/containerd 2>&1 | head -4 -echo "configured" -""" - config_result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", config_cmd], - capture_output=True, - text=True, - timeout=120, - ) - if "configured" in config_result.stdout: - print(" ✓ Docker configured to use /mnt/docker") - else: - print(f" Warning: {config_result.stderr[:200]}") - - # Step 4: Start Docker and containerd - print("[4/5] Starting Docker and containerd...") - start_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "sudo systemctl start containerd docker && sleep 3 && docker info 2>&1 | head -3", - ], - capture_output=True, - text=True, - timeout=60, - ) - if "Client:" in start_result.stdout or "Server:" in start_result.stdout: - print(" ✓ Docker started with new data root") - else: - print(f" Warning: {start_result.stderr[:200]}") - print(" Trying to start Docker again...") - subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", "sudo systemctl start docker"], - capture_output=True, - text=True, - timeout=30, - ) - - # Step 5: Verify new data root - print("[5/5] Verifying Docker data root...") - verify_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker info 2>&1 | grep 'Docker Root Dir'", - ], - capture_output=True, - text=True, - timeout=30, - ) - print(f" {verify_result.stdout.strip()}") - - # Check disk after - print("\n Disk usage after move:") - df_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "df -h / /mnt 2>/dev/null | grep -E 'Filesystem|/dev'", - ], - capture_output=True, - text=True, - ) - print(f" {df_result.stdout}") - - print(f"\n{'=' * 60}") - print(" Docker Data Moved to /mnt!") - print(f"{'=' * 60}") - print("\n Root disk now has space for OS only.") - print(" Docker images will use /mnt/docker (300GB available).") - print( - "\n Next: uv run python -m openadapt_ml.benchmarks.cli waa --rebuild" - ) - - elif args.action == "reset-windows": - print("\n=== Reset Windows (Clean Install) 
===\n") - print("Deletes existing Windows disk image and does a fresh install.\n") - - # Get VM IP - result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "publicIps", - "-o", - "tsv", - ], - capture_output=True, - text=True, - ) - if result.returncode != 0 or not result.stdout.strip(): - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - ip = result.stdout.strip() - - print(f" VM IP: {ip}") - print() - - # Step 1: Stop container - print("[1/3] Stopping WAA container...") - subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker stop winarena 2>/dev/null; docker rm winarena 2>/dev/null", - ], - capture_output=True, - text=True, - ) - print(" ✓ Container stopped") - - # Step 2: Delete Windows disk image (keep ISO for faster reinstall) - print("\n[2/3] Deleting corrupted disk image (keeping ISO cache)...") - cleanup_cmd = """ -# Ensure storage is on /mnt -sudo mkdir -p /data/waa-storage -sudo chown azureuser:azureuser /data/waa-storage -# Move from home if needed -[ -d ~/waa-storage ] && mv ~/waa-storage/* /data/waa-storage/ 2>/dev/null && rm -rf ~/waa-storage -# Delete disk image but keep ISO cache -rm -f /data/waa-storage/data.img /data/waa-storage/windows.mac /data/waa-storage/windows.rom /data/waa-storage/windows.vars -ls -lh /data/waa-storage/ -""" - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", cleanup_cmd], - capture_output=True, - text=True, - ) - print(result.stdout) - print(" ✓ Disk image deleted (ISO cache preserved for faster reinstall)") - - # Step 3: Restart with fresh install using vanilla WAA - print("\n[3/3] Starting fresh Windows installation...") - api_key = settings.openai_api_key or os.environ.get("OPENAI_API_KEY", "") - docker_cmd = f"""docker run -d \\ - --name winarena \\ - --device=/dev/kvm \\ - --cap-add NET_ADMIN \\ - --stop-timeout 120 \\ - -p 8006:8006 \\ - -p 3389:3389 \\ - -v /data/waa-storage:/storage \\ - -e VERSION=11e \\ - -e RAM_SIZE=12G \\ - -e CPU_CORES=4 \\ - -e OPENAI_API_KEY='{api_key}' \\ - --entrypoint /bin/bash \\ - windowsarena/winarena:latest \\ - -c './entry.sh --prepare-image false --start-client true --agent navi --model gpt-4o --som-origin oss --a11y-backend uia'""" - - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", docker_cmd], - capture_output=True, - text=True, - timeout=60, - ) - if result.returncode != 0: - print(f" ✗ Failed to start container: {result.stderr}") - sys.exit(1) - print(" ✓ Fresh Windows installation started") - - # Wait and monitor - print("\n VNC: http://localhost:8006 (via SSH tunnel)") - print(" Windows will install automatically (~10-15 min)...") - print(" WAA server will start on port 5000 when ready.\n") - - import time - - for i in range(45): # Wait up to 15 minutes - time.sleep(20) - - # Check if WAA server /probe endpoint responds - # Use localhost - Docker port forwarding handles routing to QEMU VM - # See docs/waa_network_architecture.md for architecture details - try: - probe_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - "-o", - "ConnectTimeout=5", - f"azureuser@{ip}", - "curl -s --connect-timeout 3 http://localhost:5000/probe 2>/dev/null", - ], - capture_output=True, - text=True, - timeout=30, - ) - except subprocess.TimeoutExpired: - probe_result = None - - if probe_result and probe_result.stdout.strip(): - print("\n✓ WAA Server ready!") - print(f" Probe response: {probe_result.stdout.strip()[:100]}") - print("\n To run benchmarks:") - print( - " uv run 
python -m openadapt_ml.benchmarks.cli waa --num-tasks 5" - ) - break - - # Show progress from docker logs - log_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker logs winarena 2>&1 | tail -2", - ], - capture_output=True, - text=True, - ) - last_log = ( - log_result.stdout.strip().split("\n")[-1][:70] - if log_result.stdout - else "Starting..." - ) - print(f" [{(i + 1) * 20}s] {last_log}...") - else: - print("\n⚠ Timeout waiting for WAA. Check VNC: http://localhost:8006 (via SSH tunnel)") - print(" Windows installation may still be in progress.") - - elif args.action == "screenshot": - print("\n=== Capturing VM Screenshot ===\n") - - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - print(" Capturing screenshot via QEMU monitor...") - - output_path = Path("training_output/current/vm_screenshot.png") - result_path = capture_vm_screenshot(ip, output_path) - - if result_path: - print(f" ✓ Screenshot saved to: {result_path}") - print( - "\n View at: http://localhost:8080/vm_screenshot.png (if server running)" - ) - else: - print(" ✗ Failed to capture screenshot") - print( - " Make sure the winarena container is running and QEMU monitor is accessible." - ) - sys.exit(1) - - elif args.action == "probe": - print("\n=== Checking WAA /probe Endpoint ===\n") - - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - - # Use 172.30.0.2 for vanilla WAA (dockurr/windows base, used by windowsarena/winarena) - internal_ip = getattr(args, "internal_ip", "172.30.0.2") - - if getattr(args, "wait", False): - # Polling mode - keep checking until ready - max_attempts = getattr(args, "max_attempts", 30) - interval = getattr(args, "interval", 20) - if poll_waa_probe( - ip, - max_attempts=max_attempts, - interval=interval, - internal_ip=internal_ip, - ): - print("\n Ready to run benchmarks:") - print( - " uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 5" - ) - else: - print("\n VNC (via SSH tunnel): http://localhost:8006") - print(f" Start tunnel: ssh -L 8006:{ip}:8006 azureuser@{ip}") - sys.exit(1) - else: - # Single check mode - print(" Checking /probe endpoint...") - - is_ready, response = check_waa_probe(ip, internal_ip=internal_ip) - - if is_ready: - print(" ✓ WAA server is READY") - print(f" Response: {response[:100] if response else '(empty)'}") - print("\n Ready to run benchmarks:") - print( - " uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 5" - ) - else: - print(" ✗ WAA server NOT responding") - print("\n To poll until ready, use: vm probe --wait") - print(" VNC (via SSH tunnel): http://localhost:8006") - print(f" Start tunnel: ssh -L 8006:{ip}:8006 azureuser@{ip}") - - elif args.action == "pool-status": - from openadapt_ml.benchmarks.vm_monitor import ( - VMPoolRegistry, - VMMonitor, - VMConfig, - ) - - print("\n=== VM Pool Status ===\n") - - registry = VMPoolRegistry() - pool = registry.get_pool() - - if pool is None: - print("No active VM pool. 
Create one with: vm setup-waa --workers N") - sys.exit(0) - - print(f" Pool ID: {pool.pool_id}") - print(f" Created: {pool.created_at}") - print(f" Workers: {len(pool.workers)}") - print(f" Tasks: {pool.completed_tasks}/{pool.total_tasks}") - print() - - # Table header - print(f"{'Name':<15} {'IP':<16} {'Status':<12} {'WAA':<6} {'Tasks':<10}") - print("-" * 60) - - for w in pool.workers: - waa_status = "Ready" if w.waa_ready else "---" - task_progress = f"{len(w.completed_tasks)}/{len(w.assigned_tasks)}" - print( - f"{w.name:<15} {w.ip:<16} {w.status:<12} {waa_status:<6} {task_progress:<10}" - ) - - # Optionally probe each VM for live status - if getattr(args, "wait", False): # Reuse --wait flag for probing - print("\nProbing VMs for WAA readiness...") - for w in pool.workers: - monitor = VMMonitor(VMConfig(name=w.name, ssh_host=w.ip)) - status = monitor.check_status() - ready = "READY" if status.waa_ready else "Not ready" - print(f" {w.name}: {ready}") - - elif args.action == "delete-pool": - from openadapt_ml.benchmarks.vm_monitor import VMPoolRegistry - from concurrent.futures import ThreadPoolExecutor, as_completed - - registry = VMPoolRegistry() - pool = registry.get_pool() - - if pool is None: - print("No active VM pool.") - sys.exit(0) - - print(f"\n=== Deleting VM Pool: {pool.pool_id} ===\n") - print(f"This will delete {len(pool.workers)} VMs:") - for w in pool.workers: - print(f" - {w.name} ({w.ip})") - - if not getattr(args, "yes", False): - confirm = input("\nType 'yes' to confirm: ") - if confirm.lower() != "yes": - print("Aborted.") - sys.exit(0) - - # Delete VMs in parallel - def delete_vm(name: str) -> tuple[str, bool, str]: - result = subprocess.run( - ["az", "vm", "delete", "-g", pool.resource_group, "-n", name, "--yes"], - capture_output=True, - text=True, - ) - if result.returncode == 0: - return (name, True, "deleted") - else: - return (name, False, result.stderr[:100]) - - print("\nDeleting VMs...") - with ThreadPoolExecutor(max_workers=5) as executor: - futures = {executor.submit(delete_vm, w.name): w.name for w in pool.workers} - for future in as_completed(futures): - name, success, msg = future.result() - status = "✓ deleted" if success else f"✗ FAILED: {msg}" - print(f" {name}: {status}") - - # Delete registry - registry.delete_pool() - print("\n✓ Pool deleted.") - - elif args.action == "cleanup-stale": - from datetime import datetime, timezone - import json - - print("\n=== Cleanup Stale Azure Resources ===\n") - print(f" Resource Group: {resource_group}") - print(" Workspace: openadapt-ml") - print(f" Job threshold: {args.max_hours} hours") - print(f" VM threshold: {args.vm_max_hours} hours") - print() - - stale_jobs = [] - stale_vms = [] - - # --- Find stale Azure ML jobs --- - print("Checking Azure ML jobs...") - result = subprocess.run( - [ - "az", - "ml", - "job", - "list", - "--resource-group", - resource_group, - "--workspace-name", - "openadapt-ml", - "-o", - "json", - ], - capture_output=True, - text=True, - ) - - if result.returncode == 0: - try: - jobs = json.loads(result.stdout) - now = datetime.now(timezone.utc) - - for job in jobs: - status = job.get("status", "").lower() - # Only consider running/queued jobs - if status not in ["running", "queued", "preparing", "starting"]: - continue - - # Parse creation time - creation_time_str = job.get("creation_context", {}).get( - "created_at" - ) - if not creation_time_str: - continue - - # Parse ISO 8601 timestamp - try: - # Handle various datetime formats - creation_time_str = creation_time_str.replace("Z", 
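-                            # datetime.fromisoformat() rejects the trailing
-                            # 'Z' UTC designator before Python 3.11, so it is
-                            # normalized to an explicit '+00:00' offset first.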
"+00:00") - creation_time = datetime.fromisoformat(creation_time_str) - if creation_time.tzinfo is None: - creation_time = creation_time.replace(tzinfo=timezone.utc) - - duration_hours = (now - creation_time).total_seconds() / 3600 - - if duration_hours > args.max_hours: - stale_jobs.append( - { - "name": job.get("name", "unknown"), - "display_name": job.get("display_name", ""), - "status": status, - "duration_hours": duration_hours, - "created_at": creation_time_str, - } - ) - except (ValueError, TypeError): - continue - - except json.JSONDecodeError: - print(" Warning: Could not parse job list") - else: - print(f" Warning: Could not list jobs: {result.stderr[:100]}") - - # --- Find stale VMs --- - print("Checking Azure VMs...") - result = subprocess.run( - [ - "az", - "vm", - "list", - "-d", - "--resource-group", - resource_group, - "-o", - "json", - ], - capture_output=True, - text=True, - ) - - if result.returncode == 0: - try: - vms = json.loads(result.stdout) - now = datetime.now(timezone.utc) - - for vm in vms: - power_state = vm.get("powerState", "").lower() - # Only check running VMs - if "running" not in power_state: - continue - - vm_name = vm.get("name", "unknown") - - # Get VM instance view for start time - instance_result = subprocess.run( - [ - "az", - "vm", - "get-instance-view", - "--resource-group", - resource_group, - "--name", - vm_name, - "-o", - "json", - ], - capture_output=True, - text=True, - ) - - if instance_result.returncode == 0: - try: - instance_view = json.loads(instance_result.stdout) - # Look for VM start time in statuses - statuses = instance_view.get("instanceView", {}).get( - "statuses", [] - ) - for status in statuses: - if status.get("code", "").startswith( - "PowerState/running" - ): - start_time_str = status.get("time") - if start_time_str: - try: - start_time_str = start_time_str.replace( - "Z", "+00:00" - ) - start_time = datetime.fromisoformat( - start_time_str - ) - if start_time.tzinfo is None: - start_time = start_time.replace( - tzinfo=timezone.utc - ) - - duration_hours = ( - now - start_time - ).total_seconds() / 3600 - - if duration_hours > args.vm_max_hours: - stale_vms.append( - { - "name": vm_name, - "size": vm.get( - "hardwareProfile", {} - ).get("vmSize", "unknown"), - "duration_hours": duration_hours, - "public_ip": vm.get( - "publicIps", "" - ), - } - ) - except (ValueError, TypeError): - pass - break - except json.JSONDecodeError: - pass - - except json.JSONDecodeError: - print(" Warning: Could not parse VM list") - else: - print(f" Warning: Could not list VMs: {result.stderr[:100]}") - - # --- Display findings --- - print() - - if not stale_jobs and not stale_vms: - print("No stale resources found.") - return - - if stale_jobs: - print(f"=== Stale Azure ML Jobs ({len(stale_jobs)}) ===") - print(f"{'Name':<40} {'Status':<12} {'Duration':<12}") - print("-" * 64) - for job in stale_jobs: - duration_str = f"{job['duration_hours']:.1f}h" - name = job["name"][:38] + ".." 
if len(job["name"]) > 40 else job["name"] - print(f"{name:<40} {job['status']:<12} {duration_str:<12}") - print() - - if stale_vms: - print(f"=== Stale VMs ({len(stale_vms)}) ===") - print(f"{'Name':<25} {'Size':<20} {'Duration':<12} {'IP':<16}") - print("-" * 75) - for vm in stale_vms: - duration_str = f"{vm['duration_hours']:.1f}h" - print( - f"{vm['name']:<25} {vm['size']:<20} {duration_str:<12} {vm['public_ip']:<16}" - ) - print() - - # --- Confirmation --- - if not getattr(args, "yes", False): - actions = [] - if stale_jobs: - actions.append(f"cancel {len(stale_jobs)} job(s)") - if stale_vms: - actions.append(f"deallocate {len(stale_vms)} VM(s)") - - confirm = input(f"This will {' and '.join(actions)}. Continue? (y/N): ") - if confirm.lower() != "y": - print("Cancelled.") - return - - # --- Cancel stale jobs --- - if stale_jobs: - print("\nCancelling stale jobs...") - for job in stale_jobs: - result = subprocess.run( - [ - "az", - "ml", - "job", - "cancel", - "--name", - job["name"], - "--resource-group", - resource_group, - "--workspace-name", - "openadapt-ml", - ], - capture_output=True, - text=True, - ) - if result.returncode == 0: - print(f" Cancelled: {job['name']}") - else: - print(f" Failed to cancel {job['name']}: {result.stderr[:50]}") - - # --- Deallocate stale VMs --- - if stale_vms: - print("\nDeallocating stale VMs...") - for vm in stale_vms: - result = subprocess.run( - [ - "az", - "vm", - "deallocate", - "--resource-group", - resource_group, - "--name", - vm["name"], - "--no-wait", - ], - capture_output=True, - text=True, - ) - if result.returncode == 0: - print(f" Deallocating: {vm['name']} (running in background)") - else: - print(f" Failed to deallocate {vm['name']}: {result.stderr[:50]}") - - print("\nCleanup complete.") - - elif args.action == "monitor": - import json - import socket - import webbrowser - import threading - import time - from datetime import datetime, timedelta - from openadapt_ml.benchmarks.vm_monitor import ( - fetch_azure_ml_jobs, - calculate_vm_costs, - get_vm_uptime_hours, - detect_vm_activity, - get_evaluation_history, - VMActivity, - AzureMLJob, - EvaluationRun, - ) - - port = getattr(args, "port", 8765) - auto_shutdown_hours = getattr(args, "auto_shutdown_hours", 0) - show_details = getattr(args, "details", False) - use_mock = getattr(args, "mock", False) - - print("\n" + "=" * 70) - print(" VM MONITOR DASHBOARD ".center(70)) - if use_mock: - print(" [MOCK DATA MODE - No VM Required] ".center(70)) - print("=" * 70 + "\n") - - # ===== MOCK DATA GENERATION ===== - if use_mock: - # Generate realistic mock data for screenshots/testing - ip = "172.171.112.41" - vm_size = "Standard_D8ds_v5" - power_state = "VM running" - uptime_hours = 2.5 - - activity = VMActivity( - is_active=True, - activity_type="benchmark_running", - description="WAA benchmark ready (154 tasks)", - ) - - jobs = [ - AzureMLJob( - job_id="abc123def456", - display_name="waa-eval-20-tasks", - status="completed", - created_at="2026-01-15T10:30:00Z", - ), - AzureMLJob( - job_id="ghi789jkl012", - display_name="waa-eval-50-tasks", - status="running", - created_at="2026-01-17T08:15:00Z", - ), - ] - - history = [ - EvaluationRun( - run_id="20260115_103045", - started_at="2026-01-15T10:30:45Z", - completed_at="2026-01-15T12:15:30Z", - num_tasks=20, - success_rate=0.65, - agent_type="api-claude", - status="completed", - ), - EvaluationRun( - run_id="20260110_145530", - started_at="2026-01-10T14:55:30Z", - completed_at="2026-01-10T16:20:15Z", - num_tasks=10, - success_rate=0.80, - 
agent_type="navi", - status="completed", - ), - ] - - costs = calculate_vm_costs(vm_size, uptime_hours) - - # ===== VM STATUS ===== - print("1. VM STATUS") - print("-" * 70) - - if not use_mock: - ip = get_vm_ip(resource_group, vm_name) - - if ip: - print(f" Name: {vm_name}") - print(f" IP Address: {ip}") - print(f" Resource: {resource_group}") - - # Get VM size for cost calculation - if not use_mock: - vm_info_result = subprocess.run( - [ - "az", - "vm", - "show", - "-d", - "-g", - resource_group, - "-n", - vm_name, - "--query", - "{size:hardwareProfile.vmSize,powerState:powerState}", - "-o", - "json", - ], - capture_output=True, - text=True, - timeout=10, - ) - vm_size = "Standard_D8ds_v5" # default - power_state = "unknown" - if vm_info_result.returncode == 0: - vm_info = json.loads(vm_info_result.stdout) - vm_size = vm_info.get("size", vm_size) - power_state = vm_info.get("powerState", "unknown") - - print(f" VM Size: {vm_size}") - print(f" State: {power_state}") - else: - print(f" ✗ VM '{vm_name}' not found") - print(" Run: uv run python -m openadapt_ml.benchmarks.cli vm setup-waa") - sys.exit(1) - - # ===== VM ACTIVITY ===== - print("\n2. CURRENT ACTIVITY") - print("-" * 70) - if not use_mock: - activity = detect_vm_activity(ip, "azureuser", "winarena", "172.30.0.2") - activity_icon = "⚙" if activity.is_active else "💤" - print(f" Status: {activity_icon} {activity.activity_type.upper()}") - print(f" Details: {activity.description}") - - # ===== COST TRACKING ===== - print("\n3. COST TRACKING") - print("-" * 70) - if not use_mock: - uptime_hours = get_vm_uptime_hours(resource_group, vm_name) - costs = calculate_vm_costs(vm_size, uptime_hours) - print(f" Uptime: {uptime_hours:.2f} hours") - print(f" Rate: ${costs.hourly_rate_usd:.3f}/hour") - print(f" Cost: ${costs.cost_usd:.2f} (current session)") - if show_details: - print(f" Daily: ${costs.cost_per_day_usd:.2f}/day") - print(f" Weekly: ${costs.cost_per_week_usd:.2f}/week") - - # ===== AZURE ML JOBS ===== - print("\n4. RECENT AZURE ML JOBS (Last 7 Days)") - print("-" * 70) - if not use_mock: - jobs = fetch_azure_ml_jobs( - resource_group=resource_group, days=7, max_results=5 - ) - if jobs: - for job in jobs[:5]: # Show top 5 - status_icon = { - "running": "▶", - "completed": "✓", - "failed": "✗", - "canceled": "⊗", - }.get(job.status, "?") - created_date = ( - job.created_at[:10] if len(job.created_at) >= 10 else job.created_at - ) - print(f" {status_icon} {job.display_name or job.job_id[:12]}") - print(f" Status: {job.status} | Created: {created_date}") - if show_details and job.azure_dashboard_url: - print(f" URL: {job.azure_dashboard_url[:70]}...") - else: - print(" No recent jobs found") - - # ===== EVALUATION HISTORY ===== - if show_details: - print("\n5. EVALUATION HISTORY") - print("-" * 70) - if not use_mock: - history = get_evaluation_history(max_runs=5) - if history: - for run in history[:5]: - success_pct = ( - f"{run.success_rate * 100:.1f}%" if run.success_rate else "N/A" - ) - print(f" • {run.run_id}") - print( - f" Tasks: {run.num_tasks} | Success: {success_pct} | Agent: {run.agent_type}" - ) - else: - print(" No evaluation history found") - - # ===== DASHBOARD & TUNNELS ===== - print("\n6. 
DASHBOARD & ACCESS") - print("-" * 70) - - # In mock mode, skip dashboard and exit cleanly - if use_mock: - print(" Dashboard: (Skipped in mock mode)") - print(" VNC: (Skipped in mock mode)") - print(f"\n{'=' * 70}") - print(" Mock data displayed successfully!") - print("=" * 70 + "\n") - return # Exit early for mock mode - - # Check if server is already running on port - def is_port_in_use(port: int) -> bool: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return s.connect_ex(("localhost", port)) == 0 - - if is_port_in_use(port): - print(f" ✓ Dashboard already running on port {port}") - else: - print(f" Starting dashboard server on port {port}...") - # Start server in background - from openadapt_ml.cloud.local import ( - get_current_output_dir, - _regenerate_benchmark_viewer_if_available, - ) - - serve_dir = get_current_output_dir().resolve() - if not serve_dir.exists(): - serve_dir.mkdir(parents=True) - _regenerate_benchmark_viewer_if_available(serve_dir) - - def start_server(): - from openadapt_ml.cloud.local import cmd_serve - import argparse - - fake_args = argparse.Namespace( - port=port, - open=False, - no_regenerate=True, - quiet=True, - benchmark=str(serve_dir), - start_page=None, - ) - cmd_serve(fake_args) - - server_thread = threading.Thread(target=start_server, daemon=True) - server_thread.start() - time.sleep(1) - print(f" ✓ Dashboard started on port {port}") - - # Start SSH tunnels for VNC and WAA - try: - from openadapt_ml.cloud.ssh_tunnel import get_tunnel_manager - - tunnel_manager = get_tunnel_manager() - tunnel_manager.start_tunnels_for_vm(ip, "azureuser") - tunnel_status = tunnel_manager.get_tunnel_status() - if tunnel_status.get("vnc") and tunnel_status["vnc"].active: - print(f" ✓ VNC tunnel: localhost:8006 -> {ip}:8006") - else: - print( - f" ⚠ VNC tunnel failed - use: ssh -L 8006:{ip}:8006 azureuser@{ip}" - ) - except Exception as e: - print(f" ⚠ Tunnel error: {str(e)[:50]}") - - # URLs - Use azure_ops.html for VM monitoring (has SSE for live updates) - url = f"http://localhost:{port}/azure_ops.html" - print(f"\n Dashboard: {url}") - print(" VNC: http://localhost:8006") - - # Auto-shutdown info - if auto_shutdown_hours > 0: - shutdown_time = datetime.now() + timedelta(hours=auto_shutdown_hours) - print( - f" Shutdown: {shutdown_time.strftime('%H:%M:%S')} ({auto_shutdown_hours}h)" - ) - - print(f"\n{'=' * 70}") - print(" Press Ctrl+C to stop monitoring") - print("=" * 70 + "\n") - - # Open browser - webbrowser.open(url) - - # Initialize trackers for live dashboard updates - from openadapt_ml.benchmarks.azure_ops_tracker import get_tracker - from openadapt_ml.benchmarks.session_tracker import start_session, get_session - - # Start session tracking (persists across page refreshes) - session = start_session(vm_size=vm_size, vm_ip=ip) - - # Initialize ops tracker with current VM state - tracker = get_tracker(vm_size=vm_size) - tracker.start_operation( - operation="monitor", - phase="Monitoring VM", - vm_ip=ip, - vm_state="running" if "running" in power_state.lower() else "unknown", - ) - - # Track start time for auto-shutdown and updates - start_time = datetime.now() - last_update = datetime.now() - update_interval = 5 # Update every 5 seconds for smoother dashboard - - # Keep running to maintain dashboard and show live status - try: - while True: - current_time = datetime.now() - elapsed = current_time - start_time - elapsed_str = f"{int(elapsed.total_seconds() // 3600)}h{int((elapsed.total_seconds() % 3600) // 60)}m" - - # Update status every 
update_interval seconds - if (current_time - last_update).total_seconds() >= update_interval: - # Quick status check - is_ready, probe_msg = check_waa_probe(ip, internal_ip="172.30.0.2") - activity = detect_vm_activity( - ip, "azureuser", "winarena", "172.30.0.2" - ) - status_line = f"WAA: {'READY' if is_ready else 'waiting'} | Activity: {activity.activity_type}" - - # Update tracker for dashboard SSE - tracker.update( - phase=f"{activity.activity_type}: {activity.description}", - vm_ip=ip, - vm_state="running", - log_lines=[ - f"[{time.strftime('%H:%M:%S')}] WAA: {'READY' if is_ready else 'waiting'}", - f"[{time.strftime('%H:%M:%S')}] Activity: {activity.activity_type}", - f"[{time.strftime('%H:%M:%S')}] {activity.description}", - ], - ) - last_update = current_time - else: - # Use cached status - is_ready, _ = check_waa_probe(ip, internal_ip="172.30.0.2") - status_line = f"WAA: {'READY' if is_ready else 'waiting'}" - - # Live status display - print( - f" [{time.strftime('%H:%M:%S')}] {status_line} | Uptime: {elapsed_str} ", - end="\r", - ) - - # Check auto-shutdown timeout - if ( - auto_shutdown_hours > 0 - and elapsed.total_seconds() >= auto_shutdown_hours * 3600 - ): - print(f"\n\n Auto-shutdown triggered after {auto_shutdown_hours}h") - deallocate_result = subprocess.run( - [ - "az", - "vm", - "deallocate", - "-g", - resource_group, - "-n", - vm_name, - "--no-wait", - ], - capture_output=True, - text=True, - ) - if deallocate_result.returncode == 0: - print(f" ✓ VM '{vm_name}' deallocation initiated") - else: - print( - f" ✗ Failed to deallocate: {deallocate_result.stderr[:50]}" - ) - break - - time.sleep(5) - except KeyboardInterrupt: - print("\n\n Monitoring stopped.") - - elif args.action == "exec": - # Execute command in container - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - cmd = getattr(args, "cmd", None) - if not cmd: - print("✗ No command specified. Use: vm exec --cmd 'your command'") - sys.exit(1) - - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - f"docker exec winarena sh -c '{cmd}'", - ], - capture_output=True, - text=True, - ) - print(result.stdout) - if result.returncode != 0: - print(f"Error: {result.stderr}") - - elif args.action == "host-exec": - # Execute command on VM host (not in container) - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - cmd = getattr(args, "cmd", None) - if not cmd: - print("✗ No command specified. Use: vm host-exec --cmd 'your command'") - sys.exit(1) - - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd], capture_output=True, text=True - ) - print(result.stdout) - if result.returncode != 0: - print(f"Error: {result.stderr}") - - elif args.action == "test-docker": - # Test docker run command to debug startup issues - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. 
Run 'vm setup-waa' first.") - sys.exit(1) - - print("\n=== Testing Docker Run Command ===\n") - print(f" VM IP: {ip}") - - # First check for port conflicts - print("\n[1/3] Checking for port conflicts...") - check_cmd = "docker ps -a --format '{{.Names}} {{.Ports}}' 2>/dev/null || echo 'No containers'" - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", check_cmd], - capture_output=True, - text=True, - ) - print(f" Containers: {result.stdout.strip()}") - - # Clean up any conflicting containers - print("\n[2/3] Cleaning up old containers...") - cleanup_cmd = "docker rm -f winarena winarena-test 2>/dev/null || true" - subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", cleanup_cmd], capture_output=True - ) - - # Build a docker command to test the vanilla WAA image - # Note: vanilla WAA uses --entrypoint /bin/bash and runs entry.sh - docker_cmd = '''docker run --rm \ - --name winarena-test \ - --device=/dev/kvm \ - --cap-add NET_ADMIN \ - -p 8006:8006 \ - -p 3389:3389 \ - -v /data/waa-storage:/storage \ - -v ~/waa-results:/results \ - --entrypoint /bin/bash \ - windowsarena/winarena:latest \ - -c "echo OEM_FILES_COPIED && ls -la /tmp/smb/ 2>/dev/null || echo 'No /tmp/smb/ dir'"''' - - print("\n[3/3] Testing docker run with waa-entry.sh...") - print(f" Command: {docker_cmd[:100]}...") - - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", docker_cmd], - capture_output=True, - text=True, - timeout=60, - ) - - print(f"\n Exit code: {result.returncode}") - if result.stdout: - print(f"\n STDOUT:\n{result.stdout}") - if result.stderr: - print(f"\n STDERR:\n{result.stderr}") - - if "OEM_FILES_COPIED" in result.stdout and "install.bat" in result.stdout: - print("\n✓ Docker test PASSED - OEM files copied correctly") - else: - print("\n✗ Docker test FAILED - OEM files not copied") - - elif args.action == "start-server": - # DEPRECATED: With vanilla WAA, the server starts automatically via entry.sh - # This action is kept for backward compatibility but now just restarts the container - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - print("\n=== Restarting WAA Container ===\n") - print(" NOTE: With vanilla WAA, the server starts automatically.") - print(" This command restarts the container to trigger server startup.\n") - print(f" VM IP: {ip}") - - # Restart the container - entry.sh will start the server - print("[1/2] Restarting winarena container...") - result = subprocess.run( - ssh_cmd(ip, "docker restart winarena"), - capture_output=True, - text=True, - timeout=60, - ) - if result.returncode != 0: - print(f"✗ Failed to restart container: {result.stderr[:200]}") - sys.exit(1) - print(" Container restarted") - - # Wait and verify server is running - print("[2/2] Waiting for server to start...") - import time - - for i in range(12): - time.sleep(10) - is_ready, response = check_waa_probe(ip, internal_ip="172.30.0.2") - if is_ready: - print("\n WAA server is running!") - print(f" Response: {response}") - break - print(f" Attempt {i + 1}/12: Not ready yet...") - else: - print("\n Server may not have started. Check VNC at http://localhost:8006") - - elif args.action == "fix-oem": - # Copy OEM files to Samba share (fixes missing install.bat) - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. 
Run 'vm setup-waa' first.") - sys.exit(1) - - print("Copying OEM files to Samba share...") - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker exec winarena sh -c 'cp -r /oem/* /tmp/smb/ 2>&1 && ls -la /tmp/smb/'", - ], - capture_output=True, - text=True, - ) - print(result.stdout) - if result.returncode == 0: - print( - "✓ OEM files copied. In Windows, run: \\\\host.lan\\Data\\install.bat" - ) - else: - print(f"Error: {result.stderr}") - - elif args.action == "logs": - # Get VM IP - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - num_lines = getattr(args, "lines", 50) - follow = getattr(args, "follow", False) - - if follow: - # Follow logs (streaming) - print(f"Following logs from winarena container on {ip}...") - print("Press Ctrl+C to stop.\n") - import os - - os.execvp( - "ssh", ["ssh", *SSH_OPTS, f"azureuser@{ip}", "docker logs -f winarena"] - ) - else: - result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - f"docker logs --tail {num_lines} winarena 2>&1", - ], - capture_output=True, - text=True, - ) - print(result.stdout) - if result.returncode != 0: - print(f"Error: {result.stderr}") - - elif args.action == "stop-build": - print("\n=== Stop Docker Build on VM ===\n") - - # Get VM IP - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Kill Docker build processes - print("[1/2] Stopping Docker build processes...") - kill_cmd = """ -pkill -f 'docker build' 2>/dev/null || true -pkill -f 'docker-buildx' 2>/dev/null || true -pkill -f buildkitd 2>/dev/null || true -echo "killed" -""" - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", kill_cmd], - capture_output=True, - text=True, - timeout=30, - ) - if "killed" in result.stdout: - print(" ✓ Build processes stopped") - else: - print(f" Warning: {result.stderr[:200]}") - - # Clean up Docker build cache - print("[2/2] Cleaning Docker build cache...") - prune_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker builder prune -af 2>&1 | tail -5", - ], - capture_output=True, - text=True, - timeout=120, - ) - print(f" {prune_result.stdout}") - print( - "\n Ready to retry: uv run python -m openadapt_ml.benchmarks.cli waa --rebuild" - ) - - elif args.action == "diag": - print(f"\n=== VM Diagnostics: {vm_name} ===\n") - - # Check VM running state first (fast Azure API call) - print("[0/4] Checking VM state...") - is_running, power_state = check_vm_running(resource_group, vm_name) - if power_state == "not_found": - print(f"✗ VM '{vm_name}' not found. 
Run 'vm setup-waa' first.") - sys.exit(1) - if not is_running: - print(f"✗ VM '{vm_name}' is not running (state: {power_state})") - print(" Start it with: uv run python -m openadapt_ml.benchmarks.cli vm start") - sys.exit(1) - print(f" ✓ VM is running ({power_state})") - - # Get VM IP - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ Could not get IP for VM '{vm_name}'") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Test SSH connectivity first with retry - print("[0.5/4] Testing SSH connectivity...") - try: - result = run_ssh_with_retry(ip, "echo 'SSH OK'", max_retries=3, verbose=True) - if result.returncode != 0: - print(f" ✗ SSH connection failed: {result.stderr[:100]}") - sys.exit(1) - print(" ✓ SSH connection established") - except subprocess.SubprocessError as e: - print(f" ✗ {e}") - print("\n Possible causes:") - print(" - VM is still booting (wait 1-2 minutes)") - print(" - Network security group blocking SSH") - print(" - SSH daemon not running on VM") - sys.exit(1) - print() - - # Helper for running diag commands with retry - def run_diag_cmd(cmd: str) -> tuple[bool, str, str]: - """Run diagnostic command with retry. Returns (success, stdout, stderr).""" - try: - result = run_ssh_with_retry(ip, cmd, max_retries=2, verbose=False) - return result.returncode == 0, result.stdout, result.stderr - except subprocess.SubprocessError: - return False, "", "SSH connection failed" - - # Disk usage - print("[1/4] Disk Usage") - print("-" * 50) - success, stdout, stderr = run_diag_cmd("df -h / /mnt 2>/dev/null || df -h /") - if success: - print(stdout) - else: - print(f" Error: {stderr[:100]}") - - # Docker info - print("[2/4] Docker Status") - print("-" * 50) - success, stdout, stderr = run_diag_cmd( - "docker system df 2>/dev/null || echo 'Docker not installed'" - ) - if success: - print(stdout) - else: - print(f" Error: {stderr[:100]}") - - # Docker images - print("[3/4] Docker Images") - print("-" * 50) - success, stdout, stderr = run_diag_cmd( - "docker images --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null || echo 'Docker not installed'" - ) - if success: - print(stdout) - else: - print(f" Error: {stderr[:100]}") - - # Running containers - print("[4/4] Running Containers") - print("-" * 50) - success, stdout, stderr = run_diag_cmd( - "docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' 2>/dev/null || echo 'Docker not installed'" - ) - if success: - print(stdout) - else: - print(f" Error: {stderr[:100]}") - - # WAA probe status - print("\n[Bonus] WAA Probe Status") - print("-" * 50) - is_ready, response = check_waa_probe(ip, internal_ip="172.30.0.2") - if is_ready: - print(f" ✓ WAA server READY: {response[:100] if response else '(empty)'}") - else: - print(" ✗ WAA server not responding") - - print("\n VNC: http://localhost:8006 (via SSH tunnel)") - print(f" SSH: ssh azureuser@{ip}") - - elif args.action == "start-windows": - """Start the Windows container using vanilla WAA image. - - This starts the winarena container with windowsarena/winarena, which - includes automatic Windows setup and WAA server installation via entry.sh. - """ - print("\n=== Starting Windows Container ===\n") - - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. 
Run 'waa --setup-only' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Check if vanilla WAA image exists - print("[1/3] Checking for windowsarena/winarena image...") - check_cmd = "docker images windowsarena/winarena:latest --format '{{.ID}}' | head -1" - check_result = subprocess.run( - ssh_cmd(ip, check_cmd), - capture_output=True, - text=True, - ) - if not check_result.stdout.strip(): - print(" ✗ windowsarena/winarena image not found!") - print(" Pull it with: uv run python -m openadapt_ml.benchmarks.cli waa --setup-only") - sys.exit(1) - print(" ✓ windowsarena/winarena image found") - - # Stop any existing container - print("[2/3] Stopping any existing container...") - subprocess.run( - ssh_cmd(ip, "docker stop winarena 2>/dev/null; docker rm -f winarena 2>/dev/null"), - capture_output=True, - text=True, - ) - print(" ✓ Cleaned up") - - # Start the container using vanilla WAA with entry.sh - print("[3/3] Starting Windows container...") - api_key = settings.openai_api_key or os.environ.get("OPENAI_API_KEY", "") - model = args.model if hasattr(args, "model") and args.model else "gpt-4o" - docker_cmd = f"""docker run -d \\ - --name winarena \\ - --device=/dev/kvm \\ - --cap-add NET_ADMIN \\ - --stop-timeout 120 \\ - -p 8006:8006 \\ - -p 3389:3389 \\ - -v /data/waa-storage:/storage \\ - -e VERSION=11e \\ - -e RAM_SIZE=12G \\ - -e CPU_CORES=4 \\ - -e OPENAI_API_KEY='{api_key}' \\ - --entrypoint /bin/bash \\ - windowsarena/winarena:latest \\ - -c './entry.sh --prepare-image false --start-client true --agent navi --model {model} --som-origin oss --a11y-backend uia'""" - - result = subprocess.run( - ssh_cmd(ip, docker_cmd), - capture_output=True, - text=True, - timeout=60, - ) - if result.returncode != 0: - print(f" ✗ Failed to start container: {result.stderr}") - sys.exit(1) - - print(" ✓ Container started") - print("\n VNC: http://localhost:8006 (via SSH tunnel)") - print(" Check probe: uv run python -m openadapt_ml.benchmarks.cli vm probe --wait") - - elif args.action == "restart-windows": - """Stop and restart the Windows container. - - This is useful when Windows becomes unresponsive or you need to - apply changes to the container configuration. - """ - print("\n=== Restarting Windows Container ===\n") - - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. 
Run 'vm setup-waa' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Stop container - print("[1/2] Stopping container...") - stop_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "docker stop winarena 2>&1 && echo 'stopped' || echo 'not_running'", - ], - capture_output=True, - text=True, - timeout=60, - ) - if "stopped" in stop_result.stdout: - print(" ✓ Container stopped") - else: - print(" Container was not running") - - # Restart container using vanilla WAA - print("[2/2] Starting container...") - # Always remove old container and create fresh one to ensure correct settings - api_key = settings.openai_api_key or os.environ.get("OPENAI_API_KEY", "") - model = args.model if hasattr(args, "model") and args.model else "gpt-4o" - docker_cmd = ( - "docker rm -f winarena 2>/dev/null; docker run -d " - "--name winarena " - "--device=/dev/kvm " - "--cap-add NET_ADMIN " - "--stop-timeout 120 " - "-p 8006:8006 " - "-p 3389:3389 " - "-v /data/waa-storage:/storage " - "-e VERSION=11e " - "-e RAM_SIZE=12G " - "-e CPU_CORES=4 " - f"-e OPENAI_API_KEY='{api_key}' " - "--entrypoint /bin/bash " - "windowsarena/winarena:latest " - f"-c './entry.sh --prepare-image false --start-client true --agent navi --model {model} --som-origin oss --a11y-backend uia'" - ) - start_result = subprocess.run( - ssh_cmd(ip, docker_cmd), - capture_output=True, - text=True, - timeout=60, - ) - if start_result.returncode == 0: - print(" ✓ Container started") - else: - print(f" ✗ Failed: {start_result.stderr[:200]}") - sys.exit(1) - - print("\n VNC: http://localhost:8006 (via SSH tunnel)") - print(" Windows will resume where it left off.") - print(" Check status: uv run python -m openadapt_ml.benchmarks.cli vm probe --wait") - - elif args.action == "check-build": - """Check Docker build status from /tmp/waa_build.log. - - Useful for monitoring background builds started with nohup. - """ - print("\n=== Docker Build Status ===\n") - - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found. Run 'vm setup-waa' first.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Check if build process is running - print("[1/3] Checking for running build process...") - ps_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "pgrep -fa 'docker build' 2>/dev/null || echo 'no_build_running'", - ], - capture_output=True, - text=True, - ) - if "no_build_running" in ps_result.stdout: - print(" No Docker build currently running") - else: - print(f" Build in progress: {ps_result.stdout.strip()[:80]}") - - # Check if vanilla WAA image exists - print("\n[2/3] Checking for windowsarena/winarena image...") - check_cmd = "docker images windowsarena/winarena:latest --format '{{.Repository}}:{{.Tag}} {{.Size}} {{.CreatedAt}}'" - check_result = subprocess.run( - ssh_cmd(ip, check_cmd), - capture_output=True, - text=True, - ) - if check_result.stdout.strip(): - print(f" ✓ Image exists: {check_result.stdout.strip()}") - else: - print(" ✗ windowsarena/winarena image not found") - - # Show build log if it exists - print("\n[3/3] Build log (last 30 lines)...") - log_result = subprocess.run( - [ - "ssh", - *SSH_OPTS, - f"azureuser@{ip}", - "tail -30 /tmp/waa_build.log 2>/dev/null || echo 'No build log found'", - ], - capture_output=True, - text=True, - ) - print("-" * 60) - print(log_result.stdout) - print("-" * 60) - - # Helpful next steps - if "no_build_running" in ps_result.stdout: - if check_result.stdout.strip(): - print("\n Build complete! 
Run benchmark:") - print(" uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 5") - else: - print("\n No image found. Start a build:") - print(" uv run python -m openadapt_ml.benchmarks.cli waa --rebuild") - else: - print("\n Build in progress. Check again later or stop it:") - print(" uv run python -m openadapt_ml.benchmarks.cli vm stop-build") - - elif args.action == "fix-docker": - """Fix Docker/containerd services on the VM. - - Restarts containerd and docker services to recover from common failures - like 'containerd socket not responding' or 'docker daemon failed to start'. - - Usage: - uv run python -m openadapt_ml.benchmarks.cli vm fix-docker - """ - print("\n=== Fixing Docker/Containerd Services ===\n") - - ip = get_vm_ip(resource_group, vm_name) - if not ip: - print(f"✗ VM '{vm_name}' not found or not running.") - sys.exit(1) - - print(f" VM IP: {ip}") - print() - - # Step 1: Stop services - print("[1/4] Stopping services...") - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", - "sudo systemctl stop docker containerd 2>&1 || true"], - capture_output=True, text=True, timeout=30, - ) - print(" ✓ Services stopped") - - # Step 2: Clean up stale sockets - print("[2/4] Cleaning up stale sockets...") - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", - "sudo rm -f /run/containerd/containerd.sock /var/run/docker.sock 2>&1 || true"], - capture_output=True, text=True, timeout=30, - ) - print(" ✓ Sockets cleaned") - - # Step 3: Restart containerd first (docker depends on it) - print("[3/4] Starting containerd...") - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", - "sudo systemctl start containerd && sleep 3 && sudo systemctl status containerd --no-pager | head -10"], - capture_output=True, text=True, timeout=60, - ) - if "active (running)" in result.stdout: - print(" ✓ containerd running") - else: - print(f" ⚠ containerd status:\n{result.stdout[:300]}") - - # Step 4: Start docker - print("[4/4] Starting docker...") - result = subprocess.run( - ["ssh", *SSH_OPTS, f"azureuser@{ip}", - "sudo systemctl start docker && sleep 3 && docker ps 2>&1"], - capture_output=True, text=True, timeout=60, - ) - if result.returncode == 0: - print(" ✓ Docker running") - print(f"\n Output:\n{result.stdout}") - else: - print(f" ✗ Docker failed:\n{result.stderr[:300]}") - print("\n Try recreating the VM if Docker won't recover:") - print(" uv run python -m openadapt_ml.benchmarks.cli vm delete -y") - print(" uv run python -m openadapt_ml.benchmarks.cli vm setup-waa") - sys.exit(1) - - print("\n✓ Docker services recovered!") - print(" Next: uv run python -m openadapt_ml.benchmarks.cli vm diag") - - -def cmd_view(args: argparse.Namespace) -> None: - """View benchmark results from collected data. - - Generates an HTML viewer for benchmark results and optionally serves it. - Uses cmd_serve from local.py for full API support (including /api/vms). 
- - Usage: - uv run python -m openadapt_ml.benchmarks.cli view --run-name {name} - """ - from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer - from openadapt_ml.cloud.local import cmd_serve - - benchmark_dir = Path(args.output) / args.run_name - - if not benchmark_dir.exists(): - print(f"Error: Benchmark directory not found: {benchmark_dir}") - print(f"\nAvailable runs in {args.output}/:") - output_dir = Path(args.output) - if output_dir.exists(): - runs = [d.name for d in output_dir.iterdir() if d.is_dir()] - if runs: - for run in sorted(runs): - print(f" - {run}") - else: - print(" (no benchmark runs found)") - else: - print(" (directory does not exist)") - sys.exit(1) - - print("\n=== Benchmark Viewer ===\n") - print(f" Run: {args.run_name}") - print(f" Directory: {benchmark_dir}") - - # Generate the HTML viewer - print("\n[1/2] Generating HTML viewer...") - output_path = generate_benchmark_viewer( - benchmark_dir=benchmark_dir, - output_path=benchmark_dir / "benchmark.html", - embed_screenshots=getattr(args, "embed_screenshots", False), - ) - print(f" Generated: {output_path}") - - # Serve the viewer using cmd_serve for full API support - print(f"\n[2/2] Starting server on port {args.port}...") - - # Create args namespace for cmd_serve - serve_args = argparse.Namespace( - port=args.port, - benchmark=str(benchmark_dir), - no_regenerate=True, # Already generated above - start_page="benchmark.html", - quiet=True, - open=not getattr(args, "no_open", False), - ) - - cmd_serve(serve_args) - - -def cmd_export_traces(args: argparse.Namespace) -> None: - """Export WAA benchmark traces as training data. - - Filters benchmark execution traces by status and exports them in - Episode format suitable for VLM fine-tuning. - - Usage: - uv run python -m openadapt_ml.benchmarks.cli export-traces --run-name waa_eval_20241214 --status passed --output training_data/ - uv run python -m openadapt_ml.benchmarks.cli export-traces --list - """ - from openadapt_ml.benchmarks.trace_export import ( - export_traces, - list_available_runs, - ) - - # List available runs - if getattr(args, "list", False): - print("\n=== Available Benchmark Runs ===\n") - runs = list_available_runs(args.input) - - if not runs: - print(f"No benchmark runs found in {args.input}/") - print("\nRun a benchmark first:") - print( - " uv run python -m openadapt_ml.benchmarks.cli test-collection --tasks 5" - ) - return - - for run in runs: - success_rate = run.get("success_rate", 0) * 100 - num_tasks = run.get("num_tasks", 0) - num_success = run.get("num_success", 0) - print(f" {run['run_name']}") - print(f" Model: {run.get('model_id', 'unknown')}") - print( - f" Tasks: {num_tasks} ({num_success} passed, {success_rate:.1f}% success)" - ) - print(f" Created: {run.get('created_at', 'N/A')}") - print() - return - - # Validate run-name is provided - if not args.run_name: - print("Error: --run-name is required (or use --list to see available runs)") - sys.exit(1) - - benchmark_dir = Path(args.input) / args.run_name - - if not benchmark_dir.exists(): - print(f"Error: Benchmark run not found: {benchmark_dir}") - print(f"\nAvailable runs in {args.input}/:") - runs = list_available_runs(args.input) - if runs: - for run in runs: - print(f" - {run['run_name']}") - else: - print(" (no runs found)") - sys.exit(1) - - print("\n=== Export WAA Traces as Training Data ===\n") - print(f" Source: {benchmark_dir}") - print(f" Output: {args.output}") - print(f" Filter: {args.status}") - print(f" Screenshots: {'copy' if not args.no_screenshots else 
'skip'}") - print(f" JSONL: {'create' if not args.no_jsonl else 'skip'}") - print() - - try: - episodes = export_traces( - benchmark_dir=benchmark_dir, - output_dir=args.output, - status_filter=args.status, - copy_screenshots=not args.no_screenshots, - create_jsonl=not args.no_jsonl, - ) - - print("\n=== Export Complete ===") - print(f" Exported {len(episodes)} episodes") - print(f" Total steps: {sum(len(ep.steps) for ep in episodes)}") - print("\nOutput files:") - print(f" Episodes: {args.output}/episodes/") - if not args.no_screenshots: - print(f" Screenshots: {args.output}/screenshots/") - if not args.no_jsonl: - print(f" Training: {args.output}/training_samples.jsonl") - print(f" Manifest: {args.output}/manifest.json") - print() - - # Show sample usage - print("Next steps:") - print(" # Load episodes in Python:") - print(" from openadapt_ml.schema import load_episode") - print( - f" episode = load_episode('{args.output}/episodes/{episodes[0].episode_id}.json')" - if episodes - else "" - ) - print() - - except Exception as e: - print(f"\nError: {e}") - if args.verbose: - import traceback - - traceback.print_exc() - sys.exit(1) - - -def cmd_screenshot(args: argparse.Namespace) -> None: - """Capture screenshots of dashboards and VMs for documentation. - - Usage: - uv run python -m openadapt_ml.benchmarks.cli screenshot - uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal - uv run python -m openadapt_ml.benchmarks.cli screenshot --list - uv run python -m openadapt_ml.benchmarks.cli screenshot --waa --pr-mode - """ - from openadapt_ml.scripts.capture_screenshots import ( - TARGETS, - PROJECT_ROOT, - capture_azure_ops_dashboard, - capture_training_dashboard, - capture_vm_monitor, - capture_vm_screenshot_from_vm, - capture_vnc_screenshot, - get_timestamp, - ) - - # List available targets - if getattr(args, "list", False): - print("\nAvailable screenshot targets:\n") - for name, info in TARGETS.items(): - print(f" {name:15} - {info['description']}") - print() - return - - # Determine targets - if getattr(args, "waa", False): - # WAA-specific targets for PR documentation - targets = ["status", "probe", "vm-screen", "diag", "vnc"] - else: - targets = args.target or list(TARGETS.keys()) - output_dir = Path(args.output) - output_dir.mkdir(parents=True, exist_ok=True) - - print("=" * 60) - print(" Screenshot Capture ".center(60)) - print("=" * 60) - print(f"\nOutput: {output_dir}") - print(f"Targets: {', '.join(targets)}\n") - - timestamp = get_timestamp() if not args.no_timestamp else "" - results = {} - - for target in targets: - info = TARGETS[target] - print(f"\n[{target}] {info['description']}") - - filename = info["filename"] - if timestamp: - filename = f"{filename}_{timestamp}" - output_path = output_dir / f"{filename}.png" - - try: - success = info["capture_fn"](output_path) - if success: - size_kb = output_path.stat().st_size / 1024 - print(f" OK: {output_path.name} ({size_kb:.1f} KB)") - results[target] = str(output_path) - else: - print(" SKIP: Not available or capture failed") - results[target] = None - except Exception as e: - print(f" ERROR: {e}") - results[target] = None - - # Summary - print("\n" + "-" * 60) - successful = [t for t, p in results.items() if p] - failed = [t for t, p in results.items() if not p] - - if successful: - print(f"Captured ({len(successful)}): {', '.join(successful)}") - if failed: - print(f"Skipped ({len(failed)}): {', '.join(failed)}") - - # Generate PR-ready markdown if requested - if getattr(args, "pr_mode", False) and successful: 
- print("\n" + "=" * 60) - print(" PR Comment Markdown ".center(60)) - print("=" * 60) - print("\n## WAA Screenshots\n") - print("The following screenshots demonstrate WAA is working:\n") - - for target in successful: - info = TARGETS[target] - path = results[target] - # Use relative path for GitHub - try: - rel_path = Path(path).relative_to(PROJECT_ROOT) - except ValueError: - rel_path = path - print(f"### {info['description']}\n") - print(f"![{target}]({rel_path})\n") - - print("\n---") - print("(Copy the markdown above to add to your PR)") - - print() - - -def cmd_setup(args: argparse.Namespace) -> None: - """Run full setup (Azure + WAA submodule).""" - import subprocess - - print("\n=== OpenAdapt-ML WAA Setup ===\n") - - # Step 1: Git submodule - print("[1/2] Checking WAA submodule...") - waa_path = find_waa_path() - if waa_path: - print(f" WAA already available at: {waa_path}") - else: - print(" Initializing WAA submodule...") - try: - subprocess.run( - ["git", "submodule", "update", "--init", "--recursive"], - check=True, - capture_output=not args.verbose, - ) - print(" WAA submodule initialized") - except subprocess.CalledProcessError as e: - print(f" Failed: {e}") - if not args.force: - sys.exit(1) - - # Step 2: Azure setup - print("\n[2/2] Azure setup...") - setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_azure.py" - if setup_script.exists(): - cmd = ["python", str(setup_script)] - if args.yes: - cmd.append("--yes") - try: - subprocess.run(cmd, check=True) - except subprocess.CalledProcessError: - print(" Azure setup failed or was cancelled") - if not args.force: - sys.exit(1) - else: - print(f" Setup script not found: {setup_script}") - print(" Run manually: python scripts/setup_azure.py") - - print("\n=== Setup Complete ===") - print("\nNext steps:") - print(" 1. Check status: python -m openadapt_ml.benchmarks.cli status") - print(" 2. Test locally: python -m openadapt_ml.benchmarks.cli test-mock") - print(" 3. Run on Azure: python -m openadapt_ml.benchmarks.cli run-azure") - print() - - -def cmd_waa(args: argparse.Namespace) -> None: - """One-command WAA benchmark setup and execution using waa-auto. - - This command handles everything needed to run WAA benchmarks: - 1. Creates Azure VM if not exists - 2. Sets up Docker with proper disk configuration - 3. Builds waa-auto Docker image (dockurr/windows + WAA components) - 4. Starts Windows container (auto-boots Windows 11, installs WAA server) - 5. Waits for WAA server to be ready - 6. Optionally runs benchmark tasks - - The command is idempotent - safe to run multiple times. - Uses dockurr/windows base with automatic Windows 11 download (VERSION=11e). 
- - Usage: - # Full setup + run benchmark - uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY - - # Just setup (no benchmark run) - uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --setup-only - - # Force re-pull of Docker image - uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --rebuild - - # Fresh install (delete Windows storage) - uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --fresh - """ - import subprocess - import time - import webbrowser - import threading - - resource_group = args.resource_group - vm_name = args.name - location = args.location - - # Get API key - api_key = args.api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY", "") - if not api_key: - print("ERROR: OpenAI API key required.") - print(" Set with --api-key, OPENAI_API_KEY env var, or in .env file") - sys.exit(1) - - print("\n" + "=" * 60) - print(" WAA Benchmark - waa-auto (dockurr/windows + WAA)") - print("=" * 60) - print() - print("This will:") - print(" 1. Create/verify Azure VM with nested virtualization") - print(" 2. Install/verify Docker with /mnt storage (300GB)") - print(" 3. Build waa-auto Docker image (auto-downloads Windows 11)") - print(" 4. Start Windows container (boots Windows, installs WAA)") - print(" 5. Wait for WAA server to be ready") - if not args.setup_only: - print(f" 6. Run benchmark with {args.num_tasks} tasks") - print() + try: + # Keep tunnel alive + tunnel_proc.wait() + except KeyboardInterrupt: + print("\nClosing SSH tunnel...") + tunnel_proc.terminate() - # Track overall progress - total_steps = 6 if not args.setup_only else 5 - current_step = 0 + return 0 - def step(msg: str) -> None: - nonlocal current_step - current_step += 1 - print(f"\n[{current_step}/{total_steps}] {msg}") - # ======================================== - # Step 1: Create/verify Azure VM - # ======================================== - step("Creating/verifying Azure VM...") +def _show_run_logs(ip: str, follow: bool = False, tail: Optional[int] = None) -> int: + """Show the most recent run command log file. - ip = get_vm_ip(resource_group, vm_name) - if ip: - print(f" VM already exists: {ip}") - else: - if args.fresh: - # Delete existing VM first - print(" --fresh flag: Deleting existing VM...") - subprocess.run( - ["az", "vm", "delete", "-g", resource_group, "-n", vm_name, "-y"], - capture_output=True, text=True, - ) + Args: + ip: VM IP address + follow: If True, use tail -f to stream the log + tail: Number of lines to show (default: entire file or 100 for follow) - # Always clean up leftover resources before creating VM - # This prevents failures from orphaned VNETs, NICs, NSGs, PublicIPs, disks - cleanup_waa_resources(resource_group, vm_name) - - print(" Creating new VM (this takes 2-3 minutes)...") - # Try multiple sizes (in case quota is unavailable) and locations - # D8ds_v5: 300GB temp, best for WAA. D8s_v3: 64GB temp, fallback. 
- sizes_to_try = ["Standard_D8ds_v5", "Standard_D8s_v3", "Standard_D4ds_v4"] - locations_to_try = [location, "westus2", "centralus", "eastus2"] - - vm_created = False - last_error = "" - for size in sizes_to_try: - if vm_created: - break - for loc in locations_to_try: - result = subprocess.run( - [ - "az", "vm", "create", - "--resource-group", resource_group, - "--name", vm_name, - "--location", loc, - "--image", "Ubuntu2204", - "--size", size, - "--admin-username", "azureuser", - "--generate-ssh-keys", - "--public-ip-sku", "Standard", - ], - capture_output=True, text=True, - ) - if result.returncode == 0: - vm_info = json.loads(result.stdout) - ip = vm_info.get("publicIpAddress", "") - print(f" VM created: {size} in {loc}, IP: {ip}") - vm_created = True - break - else: - last_error = result.stderr[:200] - # Check if it's a quota error vs location error - if "quota" in result.stderr.lower() or "limit" in result.stderr.lower(): - print(f" {size}: quota unavailable, trying smaller size...") - break # Try next size - else: - print(f" {size} in {loc}: unavailable, trying next...") - - if not vm_created: - print("ERROR: Could not create VM with any size/region combination") - print(f" Last error: {last_error}") - print("\n Try requesting quota increase for DDSv5 family in Azure portal.") - sys.exit(1) - - # ======================================== - # Step 2: Install/verify Docker - # ======================================== - step("Setting up Docker with /mnt storage...") - - # Check if Docker is already configured correctly - check_docker = subprocess.run( - ssh_cmd(ip, "docker info 2>/dev/null | grep -q 'Docker Root Dir: /mnt/docker' && echo OK"), - capture_output=True, text=True, timeout=30, + Returns: + Exit code (0 for success, 1 for error) + """ + # Find the most recent run log file + result = ssh_run( + ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1" ) + log_file = result.stdout.strip() + + if not log_file: + print("No run logs found at /home/azureuser/cli_logs/run_*.log") + print("Run a benchmark first: cli_v2 run --task ") + return 1 + + print(f"Run log: {log_file}") + print("-" * 60) - if "OK" in check_docker.stdout: - print(" Docker already configured correctly") + if follow: + # Stream the log file + print("Streaming log (Ctrl+C to stop)...") + subprocess.run(["ssh", *SSH_OPTS, f"azureuser@{ip}", f"tail -f {log_file}"]) else: - print(" Installing Docker with /mnt storage (300GB)...") - docker_cmds = [ - "sudo apt-get update -qq", - "sudo apt-get install -y -qq docker.io", - "sudo systemctl start docker", - "sudo systemctl enable docker", - "sudo usermod -aG docker $USER", - "sudo systemctl stop docker", - "sudo mkdir -p /mnt/docker", - # Configure Docker to use /mnt and enable BuildKit with cache limits - 'echo \'{"data-root": "/mnt/docker", "features": {"buildkit": true}}\' | sudo tee /etc/docker/daemon.json', - # Configure BuildKit garbage collection (30GB max cache) - "sudo mkdir -p /etc/buildkit", - 'echo \'[worker.oci]\\n gc = true\\n gckeepstorage = 30000000000\\n[[worker.oci.gcpolicy]]\\n keepBytes = 30000000000\\n keepDuration = 172800\\n filters = ["type==source.local", "type==exec.cachemount", "type==source.git.checkout"]\' | sudo tee /etc/buildkit/buildkitd.toml', - "sudo systemctl start docker", - ] - result = subprocess.run( - ssh_cmd(ip, " && ".join(docker_cmds)), - capture_output=True, text=True, timeout=180, - ) - if result.returncode != 0: - print(f" WARNING: Docker setup may have issues: {result.stderr[:200]}") + # Show the log file contents + 
if tail: + cmd = f"tail -n {tail} {log_file}" else: - print(" Docker installed with /mnt storage") + # Check file size first - if small, cat it; if large, use tail + size_result = ssh_run(ip, f"wc -l < {log_file}") + line_count = ( + int(size_result.stdout.strip()) + if size_result.stdout.strip().isdigit() + else 0 + ) - # Verify nested virtualization - virt_check = subprocess.run( - ssh_cmd(ip, "egrep -c '(vmx|svm)' /proc/cpuinfo"), - capture_output=True, text=True, timeout=30, - ) - cpu_count = virt_check.stdout.strip() - if cpu_count and int(cpu_count) > 0: - print(f" Nested virtualization: OK ({cpu_count} CPUs with vmx/svm)") - else: - print(" ERROR: Nested virtualization not supported - WAA won't work") - print(" Make sure VM size is Standard_D8ds_v5 or similar v5 series") - sys.exit(1) - - # ======================================== - # Step 3: Build waa-auto Docker image - # ======================================== - step("Building waa-auto Docker image...") - - # Check if waa-auto image exists - check_image = subprocess.run( - ssh_cmd(ip, "docker images waa-auto:latest --format '{{.ID}}' | head -1"), - capture_output=True, text=True, timeout=30, - ) - waa_auto_exists = bool(check_image.stdout.strip()) + if line_count <= 200: + cmd = f"cat {log_file}" + else: + print( + f"(Showing last 100 of {line_count} lines, use --tail N for more)" + ) + cmd = f"tail -n 100 {log_file}" - if args.rebuild: - print(" --rebuild flag: Forcing image rebuild...") - waa_auto_exists = False + subprocess.run(["ssh", *SSH_OPTS, f"azureuser@{ip}", cmd]) - if waa_auto_exists: - print(" waa-auto image already exists") - else: - print(" Building waa-auto image (dockurr/windows + WAA components)...") - print(" (This may take 10-15 minutes on first run)") + return 0 - # Find the Dockerfile in our repo - dockerfile_path = Path(__file__).parent / "waa_deploy" / "Dockerfile" - if not dockerfile_path.exists(): - print(f" ERROR: Dockerfile not found at: {dockerfile_path}") - sys.exit(1) - # Copy Dockerfile and support files to VM - build_dir = "/tmp/waa-build" - subprocess.run( - ssh_cmd(ip, f"mkdir -p {build_dir}"), - capture_output=True, text=True, timeout=30, - ) +def cmd_logs(args): + """Show comprehensive logs from the WAA container. - # Copy files using scp - for filename in ["Dockerfile", "api_agent.py", "start_waa_server.bat"]: - src = Path(__file__).parent / "waa_deploy" / filename - if src.exists(): - subprocess.run( - ["scp", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", - str(src), f"azureuser@{ip}:{build_dir}/"], - capture_output=True, text=True, timeout=60, - ) + Default behavior shows all relevant logs (docker, storage, probe status). + Use --follow to stream docker logs continuously. + Use --run to show run command output instead of container logs. 
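+
+    Usage sketch (assuming the cli_v2 entry point; flags as defined in main):
+
+        cli_v2 logs                    # one-shot status summary
+        cli_v2 logs --follow           # stream container or build logs
+        cli_v2 logs --run --tail 50    # tail the latest run log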
+ """ + ip = get_vm_ip() + if not ip: + print("ERROR: VM not found") + return 1 + + # Handle --run flag: show run command output + if args.run: + return _show_run_logs(ip, args.follow, args.tail) + + # Check if container exists + result = ssh_run(ip, "docker ps -a --filter name=winarena --format '{{.Status}}'") + container_status = result.stdout.strip() + container_exists = bool(container_status) + + # If --follow, stream the most relevant logs + if args.follow: + # Priority 1: If container is running, stream container logs + if container_exists and "Up" in container_status: + print(f"Streaming container logs from VM ({ip}):") + print("Press Ctrl+C to stop") + print("-" * 60) + subprocess.run( + ["ssh", *SSH_OPTS, f"azureuser@{ip}", "docker logs -f winarena 2>&1"] + ) + return 0 + + # Priority 2: Check for active docker build + result = ssh_run( + ip, + "pgrep -f 'docker build' >/dev/null && echo BUILD_RUNNING || echo NO_BUILD", + ) + if "BUILD_RUNNING" in result.stdout: + print(f"Docker build in progress on VM ({ip})") + print("Streaming build logs (Ctrl+C to stop):") + print("-" * 60) + # Find and tail the most recent build log + subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + "tail -f $(ls -t ~/cli_logs/build_*.log 2>/dev/null | head -1) 2>/dev/null || " + "tail -f ~/build.log 2>/dev/null || " + "echo 'No build logs found - build may have just started'", + ] + ) + return 0 - # Build the image - build_process = subprocess.Popen( - ssh_cmd(ip, f"cd {build_dir} && docker build -t waa-auto:latest . 2>&1"), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, + # Priority 3: No container, no build - show helpful message + print(f"Container 'winarena' not running on VM ({ip})") + print() + # Check if image exists + result = ssh_run( + ip, "docker images waa-auto:latest --format '{{.Repository}}:{{.Tag}}'" ) + if result.stdout.strip(): + print("Image 'waa-auto:latest' is ready.") + print("Run: uv run python -m openadapt_ml.benchmarks.cli_v2 start") + else: + print("Image not yet built.") + print("Run: uv run python -m openadapt_ml.benchmarks.cli_v2 build") + return 1 - for line in build_process.stdout: - line = line.rstrip() - # Show progress lines - if any(x in line.lower() for x in ["step", "pulling", "download", "extract", "complete", "error", "successfully"]): - print(f" {line[-100:]}", flush=True) - - build_process.wait() - if build_process.returncode != 0: - print(" ERROR: Docker build failed") - sys.exit(1) - print(" waa-auto image built successfully") + # Default: show comprehensive status + import sys - # ======================================== - # Step 4: Start Windows container - # ======================================== - step("Starting Windows container...") + print(f"WAA Status ({ip})") + print("=" * 60) + sys.stdout.flush() - # Stop any existing container + # Docker images + print("\n[Docker Images]", flush=True) subprocess.run( - ssh_cmd(ip, "docker stop winarena 2>/dev/null; docker rm -f winarena 2>/dev/null"), - capture_output=True, text=True, timeout=30, + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + "docker images --format 'table {{.Repository}}\\t{{.Tag}}\\t{{.Size}}' 2>/dev/null | head -5", + ] ) - # Handle --fresh flag - if args.fresh: - print(" --fresh flag: Deleting Windows storage...") + # Container status + print("\n[Container]", flush=True) + if container_exists: + print(f" Status: {container_status}", flush=True) + else: + print(" Container 'winarena' not created yet", flush=True) + # Check for active build + result = 
ssh_run( + ip, + "pgrep -f 'docker build' >/dev/null && echo BUILD_RUNNING || echo NO_BUILD", + ) + if "BUILD_RUNNING" in result.stdout: + print(" Docker build in progress...", flush=True) + + # Only show these sections if container exists + if container_exists and "Up" in container_status: + # Storage info + print("\n[Storage]", flush=True) + subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + "docker exec winarena sh -c '" + 'echo " Total: $(du -sh /storage/ 2>/dev/null | cut -f1)"; ' + 'ls -lh /storage/*.img 2>/dev/null | awk "{print \\" Disk image: \\" \\$5}" || true' + "'", + ] + ) + + # QEMU VM status + print("\n[QEMU VM]", flush=True) subprocess.run( - ssh_cmd(ip, "sudo rm -rf /data/waa-storage/* 2>/dev/null || true"), - capture_output=True, text=True, timeout=30, + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + "docker exec winarena sh -c '" + "QPID=$(pgrep -f qemu-system 2>/dev/null | head -1); " + 'if [ -n "$QPID" ]; then ' + ' echo " Status: Running (PID $QPID)"; ' + ' ps -o %cpu,%mem,etime -p $QPID 2>/dev/null | tail -1 | awk "{print \\" CPU: \\" \\$1 \\"%, MEM: \\" \\$2 \\"%, Uptime: \\" \\$3}"; ' + "else " + ' echo " Status: Not running"; ' + "fi" + "'", + ] ) - # Ensure storage directory exists - subprocess.run( - ssh_cmd(ip, "sudo mkdir -p /data/waa-storage && sudo chown azureuser:azureuser /data/waa-storage"), - capture_output=True, text=True, timeout=30, - ) + # WAA server probe + print("\n[WAA Server]", flush=True) + subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe 2>/dev/null && echo ' (READY)' || echo 'Not ready (Windows installing - check VNC for progress)'", + ] + ) - # Start the container using waa-auto (dockurr/windows base + WAA components) - # This uses dockurr/windows entry.sh for Windows boot, not WAA's entry.sh - docker_run_cmd = f"""docker run -d --name winarena \\ - --device=/dev/kvm \\ - --cap-add NET_ADMIN \\ - --stop-timeout 120 \\ - -p 8006:8006 -p 3389:3389 -p 5000:5000 \\ - -v /data/waa-storage:/storage \\ - -e VERSION=11e \\ - -e RAM_SIZE=12G \\ - -e CPU_CORES=4 \\ - -e OPENAI_API_KEY='{api_key}' \\ - waa-auto:latest""" + # Windows install log (written by install.bat to Samba share at Z:\install_log.txt) + # The Samba share \\host.lan\Data maps to /tmp/smb inside the container + result = ssh_run( + ip, "docker exec winarena cat /tmp/smb/install_log.txt 2>/dev/null | wc -l" + ) + install_log_lines = result.stdout.strip() + if install_log_lines and install_log_lines != "0": + print("\n[Windows Install Log]", flush=True) + # Show last 10 lines of the install log (shows current step like [5/14] Installing Git...) 
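+        # (The "0" guard above matters: wc -l prints 0 until install.bat has
+        # written anything, so the install-log section only appears once
+        # Windows setup has started logging.)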
+ subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + "docker exec winarena tail -10 /tmp/smb/install_log.txt 2>/dev/null", + ] + ) - result = subprocess.run( - ssh_cmd(ip, docker_run_cmd), - capture_output=True, text=True, timeout=60, - ) - if result.returncode != 0: - print(f" ERROR: Failed to start container: {result.stderr[:200]}") - sys.exit(1) - print(" Container started") - - # ======================================== - # Step 5: Wait for WAA server - # ======================================== - step("Waiting for WAA server to be ready...") - print(" (Windows boots in 2-3 min if cached, 15-20 min on first run)") - print(" VNC available at: http://localhost:8006 (via SSH tunnel)") - print(f" Start SSH tunnel: ssh -fN -L 8006:localhost:8006 azureuser@{ip}") - - # Note: We don't auto-open browser for VNC because SSH tunnel must be started first - # User should use 'vm monitor' which handles tunnels automatically - if args.open: - print(" Note: --open ignored for VNC. Use 'vm monitor' to auto-manage tunnels.") - - # Poll for WAA server readiness - max_wait_minutes = 25 - poll_interval = 15 - max_attempts = (max_wait_minutes * 60) // poll_interval - - for attempt in range(max_attempts): - is_ready, response = check_waa_probe(ip, timeout=5, internal_ip="172.30.0.2") - if is_ready: - print(f"\n WAA server is ready!") - break + # Recent docker logs + tail_lines = args.tail if args.tail else 20 + print(f"\n[Recent Logs (last {tail_lines} lines)]", flush=True) + print("-" * 60, flush=True) + subprocess.run( + [ + "ssh", + *SSH_OPTS, + f"azureuser@{ip}", + f"docker logs --tail {tail_lines} winarena 2>&1", + ] + ) - elapsed = (attempt + 1) * poll_interval - elapsed_min = elapsed // 60 - elapsed_sec = elapsed % 60 - print(f" Attempt {attempt + 1}/{max_attempts}: Not ready yet ({elapsed_min}m {elapsed_sec}s elapsed)") - time.sleep(poll_interval) + print("\n" + "=" * 60, flush=True) + print("VNC: ssh -L 8006:localhost:8006 azureuser@" + ip, flush=True) + print(" Then open http://localhost:8006", flush=True) + print(" (Windows installation % visible on VNC screen)", flush=True) else: - print(f"\n WARNING: WAA server not responding after {max_wait_minutes} minutes") - print(" Check VNC at http://localhost:8006 (via SSH tunnel) for Windows installation status") - if args.setup_only: - sys.exit(0) # Setup is complete even if server not ready yet + # Show next steps + print("\n[Next Steps]") + result = ssh_run(ip, "docker images waa-auto:latest --format '{{.Repository}}'") + if result.stdout.strip(): + print(" Image ready. Run: cli_v2 start") else: - sys.exit(1) - - # ======================================== - # Step 6: Run benchmark (if not --setup-only) - # ======================================== - if not args.setup_only: - step(f"Running benchmark with {args.num_tasks} tasks...") - - # Run benchmark using navi agent INSIDE the container - # The client code is at /client in the waa-auto container - # Must use -w /client to set working directory (settings.json is there) - # som_origin options: 'oss' (default), 'a11y', 'mixed-oss', 'omni', 'mixed-omni' - run_cmd = f"""docker exec -w /client -e OPENAI_API_KEY='{api_key}' winarena \\ - python run.py \\ - --model {args.model} \\ - --agent navi \\ - --num_tasks {args.num_tasks} \\ - --som_origin oss \\ - --result_dir results 2>&1""" - - print(f" Model: {args.model}") - print(f" Tasks: {args.num_tasks}") - print() + print(" Build image first. 
Run: cli_v2 build") - # Stream output - run_process = subprocess.Popen( - ssh_cmd(ip, run_cmd), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, - ) - - for line in run_process.stdout: - print(f" {line.rstrip()}", flush=True) + return 0 - run_process.wait() - if run_process.returncode != 0: - print("\n Benchmark run had errors (see output above)") - else: - print("\n Benchmark completed!") - # ======================================== - # Done - # ======================================== - print("\n" + "=" * 60) - print(" WAA Setup Complete!") - print("=" * 60) - print() - print(f" VM IP: {ip}") - print(" VNC: http://localhost:8006 (via SSH tunnel)") - print() - print(" Next steps:") - print(" # Monitor VM and manage SSH tunnels (RECOMMENDED - auto-manages tunnels):") - print(" uv run python -m openadapt_ml.benchmarks.cli vm monitor") - print() - print(" # Run more benchmark tasks:") - print(f" uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 20") - print() - print(" # Deallocate VM when done (stops billing):") - print(" uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y") - print() +# ============================================================================= +# Main +# ============================================================================= -def main() -> None: +def main(): parser = argparse.ArgumentParser( - description="WAA Benchmark CLI - Windows Agent Arena evaluation toolkit", + description="WAA Benchmark CLI v2 - Minimal working CLI", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" -Quick Start: - # First time setup (Azure + WAA submodule) - python -m openadapt_ml.benchmarks.cli setup - - # Check everything is configured - python -m openadapt_ml.benchmarks.cli status - - # Test locally with mock adapter - python -m openadapt_ml.benchmarks.cli test-mock +Examples: + # Full setup workflow (vanilla WAA) + %(prog)s create # Create Azure VM + %(prog)s pull # Pull vanilla WAA image + %(prog)s start # Start container + Windows + %(prog)s probe --wait # Wait for WAA server + %(prog)s run --num-tasks 1 --agent navi # Run benchmark + %(prog)s deallocate # Stop billing - # Run on Azure - python -m openadapt_ml.benchmarks.cli run-azure - """, - ) - subparsers = parser.add_subparsers(dest="command", help="Command to run") + # Monitor in separate terminal + %(prog)s logs --docker # Docker container logs + %(prog)s vnc # View Windows desktop - # Setup (new!) - p_setup = subparsers.add_parser("setup", help="One-command setup (Azure + WAA)") - p_setup.add_argument( - "--yes", "-y", action="store_true", help="Skip confirmation prompts" + # Cleanup + %(prog)s delete +""", ) - p_setup.add_argument("--force", action="store_true", help="Continue on errors") - p_setup.add_argument("--verbose", "-v", action="store_true", help="Verbose output") - - # WAA - One command to setup and run WAA benchmarks - p_waa = subparsers.add_parser( - "waa", - help="One-command WAA benchmark setup using vanilla Microsoft WAA", - description=""" -One-command WAA benchmark setup and execution using vanilla Microsoft WAA. - -This command handles everything needed to run WAA benchmarks: - 1. Creates Azure VM if not exists - 2. Sets up Docker with proper disk configuration - 3. Pulls the official windowsarena/winarena Docker image - 4. Starts Windows container with entry.sh (auto-boots Windows, starts server) - 5. Waits for WAA server to be ready - 6. Optionally runs benchmark tasks - -The command is idempotent - safe to run multiple times. 
-Uses Microsoft's vanilla WAA scripts (no custom Dockerfile). - -Examples: - # Full setup + run benchmark - uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY - - # Just setup (no benchmark run) - uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --setup-only - # Run 20 tasks - uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 20 + subparsers = parser.add_subparsers(dest="command", required=True) - # Force re-pull of Docker image - uv run python -m openadapt_ml.benchmarks.cli waa --rebuild + # create + p_create = subparsers.add_parser("create", help="Create Azure VM") + p_create.set_defaults(func=cmd_create) - # Fresh install (delete VM and Windows storage) - uv run python -m openadapt_ml.benchmarks.cli waa --fresh - """, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - p_waa.add_argument( - "--api-key", - help="OpenAI API key (or set OPENAI_API_KEY env var)", - ) - p_waa.add_argument( - "--num-tasks", - type=int, - default=5, - help="Number of benchmark tasks to run (default: 5)", - ) - p_waa.add_argument( - "--model", - default="gpt-4o", - help="OpenAI model to use (default: gpt-4o)", - ) - p_waa.add_argument( - "--setup-only", - action="store_true", - help="Only setup VM/Docker/image, don't run benchmark", - ) - p_waa.add_argument( - "--rebuild", - action="store_true", - help="Force re-pull of windowsarena/winarena Docker image", - ) - p_waa.add_argument( - "--fresh", - action="store_true", - help="Delete VM and Windows storage, start fresh", - ) - p_waa.add_argument( - "--open", - action="store_true", - default=True, - help="Open VNC in browser when ready (default: True)", - ) - p_waa.add_argument( - "--no-open", - action="store_false", - dest="open", - help="Don't open VNC in browser", - ) - p_waa.add_argument( - "--resource-group", - default="openadapt-agents", - help="Azure resource group (default: openadapt-agents)", - ) - p_waa.add_argument( - "--name", - default="waa-eval-vm", - help="VM name (default: waa-eval-vm)", - ) - p_waa.add_argument( - "--location", - default="eastus", - help="Azure region (default: eastus)", - ) + # delete + p_delete = subparsers.add_parser("delete", help="Delete VM and all resources") + p_delete.set_defaults(func=cmd_delete) - # Status - p_status = subparsers.add_parser("status", help="Check Azure and WAA status") - p_status.add_argument("--verbose", "-v", action="store_true", help="Verbose output") + # status + p_status = subparsers.add_parser("status", help="Show VM status") + p_status.set_defaults(func=cmd_status) - # Az-status (lightweight, no Azure SDK) - p_az_status = subparsers.add_parser( - "az-status", help="Check Azure resource status (uses az CLI)" - ) - p_az_status.add_argument( - "--resource-group", default="openadapt-agents", help="Azure resource group name" - ) - p_az_status.add_argument( - "--workspace", default="openadapt-ml", help="Azure ML workspace name" - ) - p_az_status.add_argument( - "--acr-name", default="openadaptacr", help="Azure Container Registry name" + # build + p_build = subparsers.add_parser( + "build", help="Build WAA image from waa_deploy/Dockerfile" ) + p_build.set_defaults(func=cmd_build) - # Cleanup - p_cleanup = subparsers.add_parser( - "cleanup", help="Delete all Azure compute instances" - ) - p_cleanup.add_argument("--yes", "-y", action="store_true", help="Skip confirmation") - p_cleanup.add_argument( - "--verbose", "-v", action="store_true", help="Verbose output" + # start + p_start = subparsers.add_parser("start", help="Start WAA 
container") + p_start.add_argument( + "--fresh", action="store_true", help="Clean storage for fresh Windows install" ) + p_start.set_defaults(func=cmd_start) - # Estimate costs - p_estimate = subparsers.add_parser("estimate", help="Estimate Azure costs") - p_estimate.add_argument("--tasks", type=int, default=154, help="Number of tasks") - p_estimate.add_argument( - "--workers", - type=int, - default=1, - help="Number of workers (default: 1 for free trial)", - ) - p_estimate.add_argument( - "--duration", type=float, default=1.0, help="Avg task duration (minutes)" - ) - p_estimate.add_argument( - "--vm-cost", type=float, default=0.19, help="VM hourly cost ($ for D4_v3)" + # stop + p_stop = subparsers.add_parser("stop", help="Stop and remove WAA container") + p_stop.add_argument( + "--clean", action="store_true", help="Also clean Windows storage" ) + p_stop.set_defaults(func=cmd_stop) - # Run local - p_local = subparsers.add_parser( - "run-local", help="Run evaluation locally (Windows)" - ) - p_local.add_argument( - "--waa-path", help="Path to WAA repository (auto-detected if not specified)" + # probe + p_probe = subparsers.add_parser("probe", help="Check if WAA server is ready") + p_probe.add_argument("--wait", action="store_true", help="Wait until ready") + p_probe.add_argument( + "--timeout", type=int, default=1200, help="Timeout in seconds (default: 1200)" ) - p_local.add_argument("--tasks", help="Comma-separated task IDs (default: all)") - p_local.add_argument("--max-steps", type=int, default=15, help="Max steps per task") - p_local.add_argument("--agent", default="random", help="Agent type") - p_local.add_argument("--seed", type=int, default=42, help="Random seed") - p_local.add_argument("--output", help="Output JSON path") - p_local.add_argument( - "--force", action="store_true", help="Force run on non-Windows" - ) - p_local.add_argument("--verbose", "-v", action="store_true", help="Verbose output") + p_probe.set_defaults(func=cmd_probe) - # Run Azure - p_azure = subparsers.add_parser("run-azure", help="Run evaluation on Azure") - p_azure.add_argument("--config", help="Azure config JSON path") - p_azure.add_argument( - "--waa-path", help="Path to WAA repository (auto-detected if not specified)" + # run + p_run = subparsers.add_parser( + "run", help="Run benchmark tasks (uses vanilla WAA navi agent)" ) - p_azure.add_argument( - "--workers", + p_run.add_argument( + "--num-tasks", type=int, default=1, - help="Number of workers (default: 1 for free trial)", - ) - p_azure.add_argument( - "--num-tasks", type=int, help="Number of random tasks to run (default: all)" - ) - p_azure.add_argument("--task-ids", help="Comma-separated specific task IDs to run") - p_azure.add_argument("--max-steps", type=int, default=15, help="Max steps per task") - p_azure.add_argument("--agent", default="random", help="Agent type") - p_azure.add_argument("--seed", type=int, default=42, help="Random seed") - p_azure.add_argument("--experiment", default="waa-eval", help="Experiment name") - p_azure.add_argument("--output", help="Output JSON path") - p_azure.add_argument("--yes", "-y", action="store_true", help="Skip confirmation") - p_azure.add_argument( - "--no-cleanup", action="store_true", help="Don't delete VMs after" - ) - p_azure.add_argument( - "--timeout", - type=float, - default=4.0, - help="Job timeout in hours (default: 4). 
Jobs are auto-canceled after this duration.", - ) - p_azure.add_argument("--verbose", "-v", action="store_true", help="Verbose output") - - # Test mock - p_mock = subparsers.add_parser("test-mock", help="Test with mock adapter") - p_mock.add_argument("--tasks", type=int, default=20, help="Number of mock tasks") - p_mock.add_argument("--max-steps", type=int, default=10, help="Max steps per task") - p_mock.add_argument("--seed", type=int, default=42, help="Random seed") - - # Test smart (SmartMockAgent - expected 100% success) - p_smart = subparsers.add_parser( - "test-smart", - help="Test mock adapter with SmartMockAgent (expected 100%% success)", - ) - p_smart.add_argument("--tasks", type=int, default=5, help="Number of mock tasks") - p_smart.add_argument("--max-steps", type=int, default=10, help="Max steps per task") - - # Test collection - p_collection = subparsers.add_parser( - "test-collection", help="Test benchmark data collection" - ) - p_collection.add_argument( - "--tasks", type=int, default=5, help="Number of mock tasks (default: 5)" - ) - p_collection.add_argument( - "--max-steps", type=int, default=10, help="Max steps per task (default: 10)" - ) - p_collection.add_argument("--seed", type=int, default=42, help="Random seed") - p_collection.add_argument( - "--model-id", default="random-agent-test", help="Model identifier" - ) - p_collection.add_argument( - "--output", default="benchmark_results", help="Output directory" - ) - p_collection.add_argument("--run-name", help="Run name (default: auto-generated)") - - # Run API-backed evaluation - p_api = subparsers.add_parser( - "run-api", help="Run evaluation with API-backed VLM (Claude/GPT-5.1)" - ) - p_api.add_argument( - "--provider", - choices=["anthropic", "openai"], - default="anthropic", - help="API provider (anthropic=Claude, openai=GPT-5.1)", - ) - p_api.add_argument( - "--tasks", type=int, default=5, help="Number of mock tasks (default: 5)" - ) - p_api.add_argument( - "--max-steps", type=int, default=10, help="Max steps per task (default: 10)" - ) - p_api.add_argument( - "--max-tokens", type=int, default=512, help="Max tokens for API response" - ) - p_api.add_argument( - "--no-a11y", action="store_true", help="Disable accessibility tree in prompt" - ) - p_api.add_argument( - "--no-history", action="store_true", help="Disable action history in prompt" - ) - p_api.add_argument("--output", default="benchmark_results", help="Output directory") - p_api.add_argument("--run-name", help="Run name (default: auto-generated)") - p_api.add_argument("--model-id", help="Model identifier (default: {provider}-api)") - p_api.add_argument( - "--mock", - action="store_true", - help="Force use of mock adapter (even if WAA is available)", - ) - p_api.add_argument( - "--waa-path", help="Path to WAA repository (auto-detected if not specified)" - ) - p_api.add_argument("--task-ids", help="Comma-separated task IDs for real WAA") - p_api.add_argument("--force", action="store_true", help="Force run on non-Windows") - p_api.add_argument("--verbose", "-v", action="store_true", help="Verbose output") - - # WAA Demo-conditioned experiment - p_demo = subparsers.add_parser( - "waa-demo", help="Run WAA demo-conditioned experiment (zero-shot vs demo)" - ) - p_demo.add_argument( - "--condition", - choices=["zero-shot", "demo"], - default="demo", - help="Experiment condition (default: demo)", - ) - p_demo.add_argument( - "--provider", - choices=["anthropic", "openai"], - default="anthropic", - help="VLM API provider (default: anthropic)", - ) - 
p_demo.add_argument( - "--tasks", help="Comma-separated task numbers 1-10 (default: all with demos)" + help="Number of tasks to run (ignored if --task specified)", ) - p_demo.add_argument( - "--max-steps", type=int, default=15, help="Max steps per task (default: 15)" - ) - p_demo.add_argument( - "--max-tokens", type=int, default=512, help="Max tokens for API response" - ) - p_demo.add_argument( - "--mock", action="store_true", help="Use mock adapter (no Windows required)" - ) - p_demo.add_argument( - "--no-a11y", action="store_true", help="Disable accessibility tree in prompt" - ) - p_demo.add_argument( - "--no-history", action="store_true", help="Disable action history in prompt" - ) - p_demo.add_argument( - "--output", default="benchmark_results", help="Output directory" - ) - p_demo.add_argument("--run-name", help="Run name (default: auto-generated)") - p_demo.add_argument("--verbose", "-v", action="store_true", help="Verbose output") - - # Create config - p_config = subparsers.add_parser("create-config", help="Create sample Azure config") - p_config.add_argument("--output", default="azure_config.json", help="Output path") - - # Cleanup VMs (frees quota) - p_cleanup_vms = subparsers.add_parser( - "cleanup-vms", help="Clean up Azure compute instances to free quota" - ) - p_cleanup_vms.add_argument( - "--resource-group", default="openadapt-agents", help="Azure resource group" - ) - p_cleanup_vms.add_argument( - "--workspace", default="openadapt-ml", help="Azure ML workspace name" - ) - p_cleanup_vms.add_argument( - "--yes", "-y", action="store_true", help="Skip confirmation" + p_run.add_argument("--task", help="Specific task ID to run") + p_run.add_argument( + "--domain", + default="all", + help="Domain filter (e.g., 'notepad', 'chrome', 'all')", ) - - # List jobs - p_list_jobs = subparsers.add_parser("list-jobs", help="List recent Azure ML jobs") - p_list_jobs.add_argument( - "--resource-group", default="openadapt-agents", help="Azure resource group" + p_run.add_argument( + "--model", default="gpt-4o", help="Model for navi agent (default: gpt-4o)" ) - p_list_jobs.add_argument( - "--workspace", default="openadapt-ml", help="Azure ML workspace name" + p_run.add_argument( + "--api-key", help="OpenAI API key (or set OPENAI_API_KEY in .env)" ) - p_list_jobs.add_argument( - "--limit", type=int, default=20, help="Max number of jobs to show" + p_run.add_argument( + "--no-download", action="store_true", help="Skip downloading results" ) + p_run.set_defaults(func=cmd_run) - # Job logs - p_job_logs = subparsers.add_parser( - "job-logs", help="Download and display logs for an Azure ML job" - ) - p_job_logs.add_argument("job_name", help="Job name (from list-jobs output)") - p_job_logs.add_argument( - "--resource-group", default="openadapt-agents", help="Azure resource group" - ) - p_job_logs.add_argument( - "--workspace", default="openadapt-ml", help="Azure ML workspace name" + # download + p_download = subparsers.add_parser( + "download", help="Download benchmark results from VM" ) + p_download.set_defaults(func=cmd_download) - # Analyze WAA results - p_analyze = subparsers.add_parser("analyze", help="Analyze WAA benchmark results") - p_analyze.add_argument("--results-dir", help="Path to results directory (local)") - p_analyze.add_argument("--vm-ip", help="IP of Azure VM to analyze results on") - p_analyze.add_argument( - "--remote", - action="store_true", - help="Run analysis on VM via SSH (faster, no download)", - ) - p_analyze.add_argument("--output", help="Output JSON path for summary") + # 
analyze + p_analyze = subparsers.add_parser("analyze", help="Analyze benchmark results") p_analyze.add_argument( - "--verbose", "-v", action="store_true", help="Show detailed task-level results" + "--results-dir", + help="Results directory (default: most recent in benchmark_results/)", ) + p_analyze.set_defaults(func=cmd_analyze) - # WAA eval VM management - p_vm = subparsers.add_parser( - "vm", help="Manage dedicated WAA eval VM (with nested virtualization)" - ) - p_vm.add_argument( - "action", - choices=[ - # Primary commands - "monitor", # THE GO-TO: dashboard + VNC + status - "status", - "ssh", - "start", - "deallocate", - "delete", - # Setup commands - "create", - "setup", - "list-sizes", - # Docker/container management - "start-windows", - "restart-windows", - "reset-windows", - "docker-prune", - "docker-move", - "fix-docker", - "fix-storage", - "stop-build", - "check-build", - "fix-oem", - # Diagnostics - "diag", - "logs", - "probe", - "exec", - "host-exec", - "screenshot", - # Legacy (prefer top-level 'waa' command) - "pull-image", - "test-docker", - "start-server", - "pool-status", - "delete-pool", - "cleanup-stale", - ], - help="Action to perform (use 'waa' command for full benchmark workflow)", - ) - p_vm.add_argument( - "--resource-group", default="openadapt-agents", help="Azure resource group" - ) - p_vm.add_argument("--name", default="waa-eval-vm", help="VM name") - p_vm.add_argument( - "--size", default="Standard_D8ds_v5", help="VM size (must support nested virt, recommend D8ds_v5 for 300GB temp storage)" - ) - p_vm.add_argument("--location", default="eastus", help="Azure region") - p_vm.add_argument( - "--acr", default="openadaptacr", help="Azure Container Registry name" - ) - p_vm.add_argument( - "--api-key", help="OpenAI API key for WAA agent (or set OPENAI_API_KEY env var)" - ) - p_vm.add_argument( - "--tasks", help="Comma-separated task IDs to run (e.g., notepad_1,notepad_2)" - ) - p_vm.add_argument( - "--num-tasks", type=int, default=5, help="Number of tasks to run (for waa command)" - ) - p_vm.add_argument( - "--domain", - choices=[ - "general", - "office", - "web", - "coding", - "system", - "creative", - "data", - "communication", - "media", - "gaming", - "utility", - ], - help="WAA domain to filter tasks (for waa command)", - ) - p_vm.add_argument( - "--task-ids", - help="Comma-separated task IDs to run (e.g., 'task_001,task_015,task_042') for waa command", - ) - p_vm.add_argument( - "--model", default="gpt-4o", help="Model to use (gpt-4o, gpt-5.2, etc.)" - ) - p_vm.add_argument( - "--agent", - default="navi", - choices=["navi", "api-claude", "api-openai"], - help="Agent type: navi (default WAA), api-claude (Claude Sonnet 4.5), api-openai (GPT-5.1)", - ) - # Multi-worker options - p_vm.add_argument( - "--workers", - type=int, - default=1, - help="Number of worker VMs to create (for setup-waa)", - ) - # Probe options - p_vm.add_argument( - "--wait", action="store_true", help="For probe: Poll until server is ready" - ) - p_vm.add_argument( - "--interval", - type=int, - default=20, - help="For probe: Seconds between poll attempts", - ) - p_vm.add_argument( - "--max-attempts", - type=int, - default=30, - help="For probe: Max poll attempts (default 30 = 10min)", - ) - p_vm.add_argument( - "--internal-ip", - default="172.30.0.2", - help="Internal IP of Windows VM (172.30.0.2 for vanilla WAA)", - ) - p_vm.add_argument( - "--yes", "-y", action="store_true", help="Skip confirmation prompts" - ) - # Viewer auto-launch options (for waa command) - p_vm.add_argument( - "--open", 
- action="store_true", - default=True, - help="Auto-open benchmark viewer (default: True)", - ) - p_vm.add_argument( - "--no-open", - action="store_false", - dest="open", - help="Disable auto-open of benchmark viewer", - ) - p_vm.add_argument( - "--port", - type=int, - default=8765, - help="Port for local dashboard server (default: 8765)", - ) - # Auto-shutdown option (for waa command) - p_vm.add_argument( - "--auto-shutdown", - action="store_true", - default=False, - help="Deallocate VM after benchmark completes to save costs (for waa command)", - ) - p_vm.add_argument( - "--auto-shutdown-hours", - type=float, - default=0, - help="For monitor: auto-deallocate VM after N hours (0=disabled)", - ) - p_vm.add_argument( - "--details", - action="store_true", - default=False, - help="For monitor: show detailed information (evaluation history, costs per day/week)", - ) - p_vm.add_argument( - "--rebuild", - action="store_true", - default=False, - help="Force re-pull of windowsarena/winarena Docker image (for waa command)", - ) - p_vm.add_argument( - "--fresh", - action="store_true", - default=False, - help="Delete Windows storage and start fresh installation (for waa command)", - ) - # Log viewing options (for logs action) - p_vm.add_argument( - "--lines", - "-n", - type=int, - default=50, - help="Number of log lines to show (for logs)", - ) - p_vm.add_argument( - "--follow", - "-f", - action="store_true", - default=False, - help="Follow log output (for logs)", - ) - # Cleanup-stale options - p_vm.add_argument( - "--max-hours", - type=float, - default=2.0, - help="For cleanup-stale: cancel jobs running longer than this (default: 2 hours)", - ) - p_vm.add_argument( - "--vm-max-hours", - type=float, - default=24.0, - help="For cleanup-stale: deallocate VMs running longer than this (default: 24 hours)", - ) - # Exec command option - p_vm.add_argument("--cmd", help="Command to execute in container (for exec action)") - # Mock data option (for screenshots/testing) - p_vm.add_argument( - "--mock", - action="store_true", - default=False, - help="Use mock data for monitor command (no VM required, for documentation/testing)", + # tasks + p_tasks = subparsers.add_parser("tasks", help="List available WAA benchmark tasks") + p_tasks.add_argument( + "--verbose", "-v", action="store_true", help="Show all task IDs" ) + p_tasks.set_defaults(func=cmd_tasks) - # Benchmark viewer subcommand - for monitoring already-running benchmarks - p_viewer = subparsers.add_parser( - "viewer", help="Launch benchmark viewer for monitoring a running VM" - ) - p_viewer.add_argument( - "--vm-ip", required=True, help="IP address of the Azure VM to monitor" - ) - p_viewer.add_argument( - "--port", - type=int, - default=8765, - help="Port for local dashboard server (default: 8765)", - ) - p_viewer.add_argument( - "--no-open", action="store_true", help="Don't auto-open browser" - ) - p_viewer.add_argument( - "--internal-ip", - default="172.30.0.2", - help="Internal IP of Windows VM (default: 172.30.0.2)", - ) + # deallocate + p_dealloc = subparsers.add_parser("deallocate", help="Stop VM (preserves disk)") + p_dealloc.set_defaults(func=cmd_deallocate) - # View benchmark results - generate and serve HTML viewer for collected benchmark data - p_view = subparsers.add_parser( - "view", help="View benchmark results from collected data" + # logs + p_logs = subparsers.add_parser("logs", help="Show WAA status and logs") + p_logs.add_argument( + "--follow", "-f", action="store_true", help="Stream docker logs continuously" ) - p_view.add_argument( 
- "--run-name", required=True, help="Name of the benchmark run to view" + p_logs.add_argument( + "--tail", "-n", type=int, help="Number of log lines to show (default: 20)" ) - p_view.add_argument( - "--output", - default="benchmark_results", - help="Base directory containing benchmark runs (default: benchmark_results)", - ) - p_view.add_argument( - "--port", type=int, default=8765, help="Port for local server (default: 8765)" - ) - p_view.add_argument( - "--no-open", action="store_true", help="Don't auto-open browser" - ) - p_view.add_argument( - "--embed-screenshots", + p_logs.add_argument( + "--run", action="store_true", - help="Embed screenshots as base64 (creates larger but standalone HTML)", + help="Show run command output instead of container logs", ) + p_logs.set_defaults(func=cmd_logs) - # Export traces as training data - p_export = subparsers.add_parser( - "export-traces", - help="Export benchmark traces as training data for VLM fine-tuning", - ) - p_export.add_argument("--run-name", help="Name of the benchmark run to export") - p_export.add_argument( - "--input", - default="benchmark_results", - help="Base directory containing benchmark runs (default: benchmark_results)", - ) - p_export.add_argument( - "--output", - "-o", - default="training_data", - help="Output directory for training data (default: training_data)", - ) - p_export.add_argument( - "--status", - choices=["passed", "failed", "all"], - default="passed", - help="Filter tasks by status (default: passed)", - ) - p_export.add_argument( - "--list", "-l", action="store_true", help="List available benchmark runs" - ) - p_export.add_argument( - "--no-screenshots", action="store_true", help="Don't copy screenshots to output" - ) - p_export.add_argument( - "--no-jsonl", - action="store_true", - help="Don't create training_samples.jsonl file", - ) - p_export.add_argument( - "--verbose", "-v", action="store_true", help="Verbose output with stack traces" - ) + # exec + p_exec = subparsers.add_parser("exec", help="Run command on VM host") + p_exec.add_argument("--cmd", required=True, help="Command to run") + p_exec.set_defaults(func=cmd_exec) - # Screenshot capture - p_screenshot = subparsers.add_parser( - "screenshot", - help="Capture screenshots of dashboards and VMs for documentation", - ) - p_screenshot.add_argument( - "--target", - "-t", - action="append", - choices=["azure-ops", "vnc", "terminal", "terminal-live", "training", "vm-screen", "probe", "diag", "status"], - help="Target to capture (can specify multiple, default: all)", - ) - p_screenshot.add_argument( - "--output", - "-o", - default="docs/screenshots", - help="Output directory for screenshots (default: docs/screenshots)", - ) - p_screenshot.add_argument( - "--list", - "-l", - action="store_true", - help="List available screenshot targets", + # docker-exec + p_dexec = subparsers.add_parser( + "docker-exec", help="Run command inside winarena container" ) - p_screenshot.add_argument( - "--no-timestamp", - action="store_true", - help="Don't add timestamp to filenames", - ) - p_screenshot.add_argument( - "--waa", - action="store_true", - help="Capture WAA-specific screenshots (status, probe, vm-screen, diag, vnc)", - ) - p_screenshot.add_argument( - "--pr-mode", - action="store_true", - help="Generate markdown suitable for a PR comment", + p_dexec.add_argument("--cmd", required=True, help="Command to run") + p_dexec.set_defaults(func=cmd_docker_exec) + + # vnc + p_vnc = subparsers.add_parser( + "vnc", help="Open VNC to view Windows desktop via SSH tunnel" ) + 
p_vnc.set_defaults(func=cmd_vnc) args = parser.parse_args() - - if args.command == "setup": - cmd_setup(args) - elif args.command == "waa": - cmd_waa(args) - elif args.command == "status": - cmd_status(args) - elif args.command == "az-status": - cmd_az_status(args) - elif args.command == "cleanup": - cmd_cleanup(args) - elif args.command == "estimate": - cmd_estimate(args) - elif args.command == "run-local": - setup_logging(getattr(args, "verbose", False)) - cmd_run_local(args) - elif args.command == "run-azure": - setup_logging(getattr(args, "verbose", False)) - cmd_run_azure(args) - elif args.command == "test-mock": - cmd_test_mock(args) - elif args.command == "test-smart": - cmd_test_smart(args) - elif args.command == "test-collection": - cmd_test_collection(args) - elif args.command == "run-api": - cmd_run_api(args) - elif args.command == "waa-demo": - cmd_waa_demo(args) - elif args.command == "create-config": - cmd_create_config(args) - elif args.command == "cleanup-vms": - cmd_cleanup_vms(args) - elif args.command == "list-jobs": - cmd_list_jobs(args) - elif args.command == "job-logs": - cmd_job_logs(args) - elif args.command == "vm": - cmd_vm(args) - elif args.command == "analyze": - cmd_analyze(args) - elif args.command == "viewer": - cmd_viewer(args) - elif args.command == "view": - cmd_view(args) - elif args.command == "export-traces": - cmd_export_traces(args) - elif args.command == "screenshot": - cmd_screenshot(args) - else: - parser.print_help() + sys.exit(args.func(args)) if __name__ == "__main__": diff --git a/openadapt_ml/benchmarks/waa_deploy/Dockerfile b/openadapt_ml/benchmarks/waa_deploy/Dockerfile index 27228ee..02d0817 100644 --- a/openadapt_ml/benchmarks/waa_deploy/Dockerfile +++ b/openadapt_ml/benchmarks/waa_deploy/Dockerfile @@ -91,8 +91,7 @@ RUN find /client -name "*.py" -exec sed -i 's|20.20.20.21|172.30.0.2|g' {} \; && COPY api_agent.py /client/mm_agents/api_agent.py # Note: API agent patching (api-claude, api-openai) skipped for now -# The navi agent works out of the box - API agents can be added later via Python patch -# after the apt-get install python3 step runs +# The navi agent works out of the box - API agents can be added later # ----------------------------------------------------------------------------- # Fix Windows setup for automation @@ -157,15 +156,33 @@ RUN if grep -q "" /run/assets/win11x64.xml; then \ fi # ----------------------------------------------------------------------------- -# Install Python and dependencies directly -# dockurr/windows base is Debian trixie which has Python 3.12 +# Copy Python 3.9 and all packages from vanilla image # ----------------------------------------------------------------------------- - -# Install Python 3 and system dependencies +# IMPORTANT: Do NOT install Python from apt or pip install packages ourselves. +# The vanilla image has Python 3.9.20 with transformers 4.46.2 which is compatible +# with GroundingDINO. 
Installing our own Python (3.13) with latest transformers (5.0) +# breaks the navi agent with: AttributeError: 'BertModel' has no attribute 'get_head_mask' + +# Copy Python 3.9 installation from vanilla (binaries, libraries, packages) +COPY --from=windowsarena/winarena:latest /usr/local/bin/python* /usr/local/bin/ +COPY --from=windowsarena/winarena:latest /usr/local/bin/pip* /usr/local/bin/ +COPY --from=windowsarena/winarena:latest /usr/local/lib/python3.9 /usr/local/lib/python3.9 +COPY --from=windowsarena/winarena:latest /usr/local/lib/libpython3.9.so* /usr/local/lib/ +COPY --from=windowsarena/winarena:latest /usr/local/include/python3.9 /usr/local/include/python3.9 + +# Ensure the shared library is found +RUN ldconfig + +# Create symlinks for python/pip commands +RUN ln -sf /usr/local/bin/python3.9 /usr/local/bin/python && \ + ln -sf /usr/local/bin/python3.9 /usr/bin/python && \ + ln -sf /usr/local/bin/python3.9 /usr/bin/python3 && \ + ln -sf /usr/local/bin/pip3.9 /usr/local/bin/pip && \ + ln -sf /usr/local/bin/pip3.9 /usr/bin/pip && \ + ln -sf /usr/local/bin/pip3.9 /usr/bin/pip3 + +# Install only system dependencies that Python packages need (not Python itself) RUN apt-get update && apt-get install -y --no-install-recommends \ - python3 \ - python3-venv \ - python3-pip \ tesseract-ocr \ libgl1 \ libglib2.0-0 \ @@ -173,32 +190,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libxext6 \ libxrender-dev \ ffmpeg \ - && rm -rf /var/lib/apt/lists/* \ - && ln -sf /usr/bin/python3 /usr/bin/python - -# Install Python dependencies for WAA client -# Using --break-system-packages since we're in a container -# Full dependency list from: github.com/microsoft/WindowsAgentArena/blob/main/src/win-arena-container/client/requirements.txt -RUN pip3 install --no-cache-dir --break-system-packages \ - torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ - pip3 install --no-cache-dir --break-system-packages \ - gymnasium farama-notifications cloudpickle packaging typer rich tqdm colorama \ - openai anthropic google-generativeai groq tiktoken \ - pyyaml jsonschema tenacity httpx backoff toml func-timeout wrapt-timeout-decorator \ - psutil pyperclip screeninfo mss pyautogui fabric \ - easyocr pillow pytesseract opencv-python-headless scikit-image ImageHash \ - requests flask beautifulsoup4 lxml cssselect xmltodict playwright requests-toolbelt \ - pydrive openpyxl python-docx python-pptx odfpy pypdf PyPDF2 pdfplumber pymupdf borb \ - xlrd xlwt xlsxwriter mammoth pdf2image \ - google-api-python-client google-auth-httplib2 google-auth-oauthlib gdown \ - numpy pandas scipy formulas rapidfuzz anytree addict \ - transformers accelerate "timm>=0.9.0,<1.0.0" ultralytics supervision pycocotools einops \ - mutagen pyacoustid chardet librosa fastdtw \ - py7zr LnkParse3 \ - matplotlib wandb yapf - -# Install Playwright browsers -RUN playwright install chromium + && rm -rf /var/lib/apt/lists/* + +# Note: Playwright browsers not copied - not needed for navi agent (uses GroundingDINO) +# If needed later, install via: python -m playwright install chromium # ----------------------------------------------------------------------------- # Environment configuration
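
To verify that the copied toolchain survives the build, a short check can be run inside the image. This is a sketch under the assumptions stated in the Dockerfile comments above (Python 3.9.20 and transformers 4.46.2 copied from the vanilla image); the image tag and script path are placeholders, not part of this patch:

    # sanity_check.py - run inside the built container, e.g.
    #   docker run --rm <built-image> python sanity_check.py
    # Confirms the interpreter and the transformers pin were copied from
    # windowsarena/winarena:latest rather than installed via apt/pip.
    import sys

    import transformers

    assert sys.version_info[:2] == (3, 9), f"unexpected Python: {sys.version}"
    print("python", sys.version.split()[0])          # expect 3.9.20
    print("transformers", transformers.__version__)  # expect 4.46.2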