From cfcb12557bd53099a99e396120b1e53f5e198b86 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 5 Feb 2026 12:48:11 -0500 Subject: [PATCH 1/6] feat(benchmarks): add HTML viewer for WAA pool benchmark results Add pool_viewer.py module and CLI command for generating interactive HTML viewers from WAA parallel benchmark runs. Features: - Parse waa-pool-*.log files to extract task results - Summary stats (total tasks, success rate, avg time per task) - Per-worker breakdown showing tasks per worker - Task list with pass/fail status and step counts - Domain breakdown with per-domain success rates - Interactive filters for domain and status Usage: uv run python -m openadapt_ml.benchmarks.cli view-pool uv run python -m openadapt_ml.benchmarks.cli view-pool --run-name pool_run_20260204 uv run python -m openadapt_ml.benchmarks.cli view-pool --no-open Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/cli.py | 329 +++++++++--- openadapt_ml/benchmarks/pool_viewer.py | 685 +++++++++++++++++++++++++ 2 files changed, 943 insertions(+), 71 deletions(-) create mode 100644 openadapt_ml/benchmarks/pool_viewer.py diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index b6504ad..010e2cc 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -80,6 +80,10 @@ "LogLevel=ERROR", "-o", "ConnectTimeout=10", + "-o", + "ServerAliveInterval=60", # Send keepalive every 60s to prevent timeout + "-o", + "ServerAliveCountMax=10", # Allow 10 missed keepalives (~10 min) before disconnect ] @@ -329,6 +333,101 @@ def wait_for_ssh(ip: str, timeout: int = 120) -> bool: return False +def set_vm_auto_shutdown( + vm_name: str, + resource_group: str = RESOURCE_GROUP, + shutdown_hours: int = 4, +) -> bool: + """Set Azure auto-shutdown policy on a VM. + + This is a safety net to prevent orphaned VMs from running indefinitely. + The VM will be automatically deallocated after the specified hours. + + Args: + vm_name: Name of the VM + resource_group: Azure resource group + shutdown_hours: Hours from now when VM should auto-shutdown (default 4) + + Returns: + True if auto-shutdown was set successfully + """ + # Calculate shutdown time (hours from now) + from datetime import timedelta + + shutdown_time = datetime.utcnow() + timedelta(hours=shutdown_hours) + # Format: HH:MM in UTC + shutdown_time_str = shutdown_time.strftime("%H:%M") + + result = subprocess.run( + [ + "az", + "vm", + "auto-shutdown", + "-g", + resource_group, + "-n", + vm_name, + "--time", + shutdown_time_str, + ], + capture_output=True, + text=True, + ) + + return result.returncode == 0 + + +def delete_test_vm_resources(test_name: str, resource_group: str = RESOURCE_GROUP): + """Delete a test VM and its associated resources. + + Used for cleanup after quota checking or failed operations. 
+ """ + # Delete VM + subprocess.run( + [ + "az", + "vm", + "delete", + "-g", + resource_group, + "-n", + test_name, + "--yes", + "--force-deletion", + "true", + ], + capture_output=True, + ) + # Delete NIC + subprocess.run( + [ + "az", + "network", + "nic", + "delete", + "-g", + resource_group, + "-n", + f"{test_name}VMNic", + ], + capture_output=True, + ) + # Delete public IP + subprocess.run( + [ + "az", + "network", + "public-ip", + "delete", + "-g", + resource_group, + "-n", + f"{test_name}PublicIP", + ], + capture_output=True, + ) + + # ============================================================================= # Commands # ============================================================================= @@ -420,6 +519,15 @@ def cmd_create(args): f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}", ) + # Set auto-shutdown as safety net (prevents orphaned VMs) + auto_shutdown_hours = getattr(args, "auto_shutdown_hours", 4) + if auto_shutdown_hours > 0: + log("CREATE", f"Setting auto-shutdown in {auto_shutdown_hours} hours...") + if set_vm_auto_shutdown(VM_NAME, RESOURCE_GROUP, auto_shutdown_hours): + log("CREATE", "Auto-shutdown configured") + else: + log("CREATE", "Warning: Failed to set auto-shutdown (VM will stay running)") + # Wait for SSH log("CREATE", "Waiting for SSH...") if not wait_for_ssh(ip): @@ -789,88 +897,58 @@ def cmd_pool_create(args): working_size = None working_region = None working_cost = None + test_vm_to_cleanup = None # Track test VM for cleanup log("POOL", "Finding available region and VM size...") - for vm_size, cost in sizes_to_try: - for region in VM_REGIONS: - # Quick check if this size/region combo works - test_name = f"waa-pool-test-{int(time.time())}" - result = subprocess.run( - [ - "az", - "vm", - "create", - "--resource-group", - RESOURCE_GROUP, - "--name", - test_name, - "--location", - region, - "--image", - "Ubuntu2204", - "--size", - vm_size, - "--admin-username", - "azureuser", - "--generate-ssh-keys", - "--public-ip-sku", - "Standard", - "--no-wait", # Don't wait for completion - ], - capture_output=True, - text=True, - ) - if result.returncode == 0: - working_size = vm_size - working_region = region - working_cost = cost - # Delete the test VM and wait for completion - log("POOL", " Found working combo, cleaning up test VM...") - subprocess.run( + try: + for vm_size, cost in sizes_to_try: + for region in VM_REGIONS: + # Quick check if this size/region combo works + test_name = f"waa-pool-test-{int(time.time())}" + test_vm_to_cleanup = test_name # Track for cleanup + result = subprocess.run( [ "az", "vm", - "delete", - "-g", + "create", + "--resource-group", RESOURCE_GROUP, - "-n", + "--name", test_name, - "--yes", - "--force-deletion", - "true", - ], - capture_output=True, - ) - # Also clean up associated resources - subprocess.run( - [ - "az", - "network", - "nic", - "delete", - "-g", - RESOURCE_GROUP, - "-n", - f"{test_name}VMNic", - ], - capture_output=True, - ) - subprocess.run( - [ - "az", - "network", - "public-ip", - "delete", - "-g", - RESOURCE_GROUP, - "-n", - f"{test_name}PublicIP", + "--location", + region, + "--image", + "Ubuntu2204", + "--size", + vm_size, + "--admin-username", + "azureuser", + "--generate-ssh-keys", + "--public-ip-sku", + "Standard", + "--no-wait", # Don't wait for completion ], capture_output=True, + text=True, ) + if result.returncode == 0: + working_size = vm_size + working_region = region + working_cost = cost + # Delete the test VM and wait for completion + log("POOL", " Found 
working combo, cleaning up test VM...") + delete_test_vm_resources(test_name, RESOURCE_GROUP) + test_vm_to_cleanup = None # Cleanup done + break + else: + test_vm_to_cleanup = None # Creation failed, nothing to cleanup + if working_size: break - if working_size: - break + finally: + # Ensure test VM is cleaned up even if an exception occurred + if test_vm_to_cleanup: + log("POOL", f"Cleaning up test VM {test_vm_to_cleanup}...") + delete_test_vm_resources(test_vm_to_cleanup, RESOURCE_GROUP) if not working_size: log("POOL", "ERROR: No available VM size/region found") @@ -882,6 +960,11 @@ def cmd_pool_create(args): log("POOL", f"Using {working_size} (${working_cost:.2f}/hr) in {working_region}") + # Get auto-shutdown hours (default 4 hours as safety net) + auto_shutdown_hours = getattr(args, "auto_shutdown_hours", 4) + if auto_shutdown_hours > 0: + log("POOL", f"VMs will auto-shutdown in {auto_shutdown_hours} hours") + def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]: """Create a single worker VM. Returns (name, ip, error).""" name = f"waa-pool-{worker_idx:02d}" @@ -967,6 +1050,8 @@ def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]: try: vm_info = json.loads(result.stdout) ip = vm_info.get("publicIpAddress", "") + # Set auto-shutdown as safety net (prevents orphaned VMs) + set_vm_auto_shutdown(name, RESOURCE_GROUP, auto_shutdown_hours) return (name, ip, None) except json.JSONDecodeError: return (name, None, "Failed to parse VM creation output") @@ -8138,6 +8223,60 @@ def cmd_azure_ml_teardown(args): return 0 +def cmd_view_pool(args): + """Generate HTML viewer for WAA pool benchmark results. + + Parses log files from pool_run_* directories and generates an interactive + HTML viewer with summary stats, per-worker breakdown, and task list. 
+ """ + import webbrowser + + from openadapt_ml.benchmarks.pool_viewer import generate_pool_results_viewer + + results_dir = Path(args.results_dir) if args.results_dir else Path("benchmark_results") + + # Find pool run directory + if args.run_name: + pool_dir = results_dir / args.run_name + if not pool_dir.exists(): + # Try with pool_run_ prefix + pool_dir = results_dir / f"pool_run_{args.run_name}" + else: + # Find most recent pool_run_* directory + pool_dirs = sorted(results_dir.glob("pool_run_*"), reverse=True) + if not pool_dirs: + print("No pool_run_* directories found in benchmark_results/") + print("Run 'pool-run' to generate benchmark results") + return 1 + pool_dir = pool_dirs[0] + + if not pool_dir.exists(): + print(f"Directory not found: {pool_dir}") + return 1 + + # Check for log files + log_files = list(pool_dir.glob("waa-pool-*.log")) + if not log_files: + print(f"No waa-pool-*.log files found in {pool_dir}") + return 1 + + print(f"Generating viewer for: {pool_dir}") + print(f"Found {len(log_files)} log file(s)") + + # Generate viewer + output_path = pool_dir / "results.html" + generate_pool_results_viewer(pool_dir, output_path) + + print(f"Generated: {output_path}") + + # Open in browser + if not args.no_open: + print("Opening in browser...") + webbrowser.open(f"file://{output_path.absolute()}") + + return 0 + + def cmd_tail_output(args): """List or tail background task output files.""" task_dir = Path("/private/tmp/claude-501/-Users-abrichr-oa-src-openadapt-ml/tasks/") @@ -8312,6 +8451,12 @@ def main(): default=1, help="Number of worker VMs to create for parallel evaluation (default: 1)", ) + p_create.add_argument( + "--auto-shutdown-hours", + type=int, + default=4, + help="Auto-shutdown VM after N hours (0 to disable, default: 4)", + ) p_create.set_defaults(func=cmd_create) # delete @@ -8358,6 +8503,12 @@ def main(): p_pool_create.add_argument( "--standard", action="store_true", help="Use D4 (4 vCPU) VMs to save costs" ) + p_pool_create.add_argument( + "--auto-shutdown-hours", + type=int, + default=4, + help="Auto-shutdown VMs after N hours (0 to disable, default: 4)", + ) p_pool_create.set_defaults(func=cmd_pool_create) # pool-wait @@ -9270,6 +9421,42 @@ def main(): ) p_resources.set_defaults(func=cmd_resources) + # view-pool - Generate HTML viewer for pool benchmark results + p_view_pool = subparsers.add_parser( + "view-pool", + help="Generate HTML viewer for WAA pool benchmark results", + description=""" +Generate an interactive HTML viewer for WAA pool benchmark results. 
+ +Parses log files from pool_run_* directories to extract task results and +generates a standalone HTML viewer with: + - Summary stats (total tasks, success rate, avg time per task) + - Per-worker breakdown + - Task list with pass/fail status + - Domain breakdown (success rate per domain) + - Filters for domain and status + +Examples: + view-pool # View most recent pool_run_* results + view-pool --run-name pool_run_20260204 # View specific run + view-pool --no-open # Generate HTML without opening browser +""", + ) + p_view_pool.add_argument( + "--run-name", + help="Name of pool run directory (e.g., pool_run_20260204)", + ) + p_view_pool.add_argument( + "--results-dir", + help="Base results directory (default: benchmark_results/)", + ) + p_view_pool.add_argument( + "--no-open", + action="store_true", + help="Don't auto-open browser", + ) + p_view_pool.set_defaults(func=cmd_view_pool) + args = parser.parse_args() sys.exit(args.func(args)) diff --git a/openadapt_ml/benchmarks/pool_viewer.py b/openadapt_ml/benchmarks/pool_viewer.py new file mode 100644 index 0000000..aa3eeb9 --- /dev/null +++ b/openadapt_ml/benchmarks/pool_viewer.py @@ -0,0 +1,685 @@ +"""WAA Pool Results Viewer - HTML viewer for parallel benchmark runs. + +Parses log files from pool_run_* directories to extract task results and +generates a standalone HTML viewer with summary stats, per-worker breakdown, +and domain analysis. + +Usage: + from openadapt_ml.benchmarks.pool_viewer import generate_pool_results_viewer + + generate_pool_results_viewer( + pool_dir=Path("benchmark_results/pool_run_20260204"), + output_path=Path("benchmark_results/pool_run_20260204/results.html"), + ) +""" + +from __future__ import annotations + +import json +import re +from datetime import datetime +from pathlib import Path +from typing import Any + + +def parse_pool_logs(pool_dir: Path) -> dict[str, Any]: + """Parse WAA pool log files to extract task results. + + Args: + pool_dir: Directory containing waa-pool-*.log files + + Returns: + Dictionary with: + - tasks: List of task results + - workers: Per-worker stats + - metadata: Run metadata (timestamps, model, etc.) 
+ """ + log_files = sorted(pool_dir.glob("waa-pool-*.log")) + if not log_files: + return {"tasks": [], "workers": {}, "metadata": {}} + + tasks = [] + workers = {} + metadata = { + "run_name": pool_dir.name, + "log_count": len(log_files), + "first_timestamp": None, + "last_timestamp": None, + "model": None, + "num_workers": None, + } + + # Regex patterns + timestamp_re = re.compile(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})') + domain_re = re.compile(r'\[Domain\]: (\S+)') + example_re = re.compile(r'\[Example ID\]: (\S+)') + instruction_re = re.compile(r'\[Instruction\]: (.+)') + finished_re = re.compile(r'Finished (\S+)/(\S+)') + result_re = re.compile(r'Result: ([0-9.]+)') + worker_re = re.compile(r'worker_id=(\d+)') + model_re = re.compile(r"model='([^']+)'") + num_workers_re = re.compile(r'num_workers=(\d+)') + step_re = re.compile(r'Step (\d+):') + + for log_file in log_files: + worker_id = log_file.stem.replace("waa-pool-", "") + workers[worker_id] = {"tasks": 0, "successes": 0, "failures": 0} + + current_task = None + last_result = None + + with open(log_file, "r", errors="ignore") as f: + for line in f: + # Strip ANSI codes + clean = re.sub(r'\x1b\[[0-9;]*m', '', line) + + # Extract timestamp + ts_match = timestamp_re.search(clean) + if ts_match: + ts_str = ts_match.group(1) + if metadata["first_timestamp"] is None: + metadata["first_timestamp"] = ts_str + metadata["last_timestamp"] = ts_str + + # Extract model name + if metadata["model"] is None: + model_match = model_re.search(clean) + if model_match: + metadata["model"] = model_match.group(1) + + # Extract num workers + if metadata["num_workers"] is None: + nw_match = num_workers_re.search(clean) + if nw_match: + metadata["num_workers"] = int(nw_match.group(1)) + + # Domain (comes before Example ID) + domain_match = domain_re.search(clean) + if domain_match: + if current_task is None: + current_task = {"worker_id": worker_id, "steps": 0} + current_task["domain"] = domain_match.group(1) + + # Example ID + example_match = example_re.search(clean) + if example_match: + if current_task is None: + current_task = {"worker_id": worker_id, "steps": 0} + current_task["task_id"] = example_match.group(1) + + # Instruction + instr_match = instruction_re.search(clean) + if instr_match and current_task: + current_task["instruction"] = instr_match.group(1) + + # Step count + step_match = step_re.search(clean) + if step_match and current_task: + step_num = int(step_match.group(1)) + if step_num > current_task.get("steps", 0): + current_task["steps"] = step_num + + # Result line + result_match = result_re.search(clean) + if result_match: + last_result = float(result_match.group(1)) + + # Finished line - finalize task + finished_match = finished_re.search(clean) + if finished_match: + domain = finished_match.group(1) + task_id = finished_match.group(2) + + if current_task is None: + current_task = {"worker_id": worker_id, "steps": 0} + + current_task["domain"] = domain + current_task["task_id"] = task_id + current_task["result"] = last_result if last_result is not None else 0.0 + current_task["success"] = last_result is not None and last_result > 0 + current_task["timestamp"] = metadata["last_timestamp"] + + # Update worker stats + workers[worker_id]["tasks"] += 1 + if current_task["success"]: + workers[worker_id]["successes"] += 1 + else: + workers[worker_id]["failures"] += 1 + + tasks.append(current_task) + current_task = None + last_result = None + + return { + "tasks": tasks, + "workers": workers, + "metadata": metadata, + } + + +def 
get_domain_stats(tasks: list[dict]) -> dict[str, dict[str, int]]: + """Calculate per-domain statistics.""" + domain_stats = {} + + for task in tasks: + domain = task.get("domain", "unknown") + if domain not in domain_stats: + domain_stats[domain] = {"total": 0, "success": 0, "fail": 0} + + domain_stats[domain]["total"] += 1 + if task.get("success"): + domain_stats[domain]["success"] += 1 + else: + domain_stats[domain]["fail"] += 1 + + return domain_stats + + +def generate_pool_results_viewer( + pool_dir: Path, + output_path: Path | None = None, +) -> Path: + """Generate HTML viewer for WAA pool benchmark results. + + Args: + pool_dir: Directory containing waa-pool-*.log files + output_path: Output HTML path. Defaults to pool_dir/results.html + + Returns: + Path to generated HTML file. + """ + pool_dir = Path(pool_dir) + if output_path is None: + output_path = pool_dir / "results.html" + + # Parse logs + data = parse_pool_logs(pool_dir) + tasks = data["tasks"] + workers = data["workers"] + metadata = data["metadata"] + + # Calculate stats + num_tasks = len(tasks) + num_success = sum(1 for t in tasks if t.get("success")) + success_rate = (num_success / num_tasks * 100) if num_tasks > 0 else 0 + + # Domain stats + domain_stats = get_domain_stats(tasks) + + # Calculate elapsed time + elapsed_str = "N/A" + if metadata.get("first_timestamp") and metadata.get("last_timestamp"): + try: + fmt = "%Y-%m-%d %H:%M:%S" + start = datetime.strptime(metadata["first_timestamp"], fmt) + end = datetime.strptime(metadata["last_timestamp"], fmt) + elapsed = end - start + hours, remainder = divmod(int(elapsed.total_seconds()), 3600) + minutes, seconds = divmod(remainder, 60) + if hours > 0: + elapsed_str = f"{hours}h {minutes}m {seconds}s" + elif minutes > 0: + elapsed_str = f"{minutes}m {seconds}s" + else: + elapsed_str = f"{seconds}s" + except Exception: + pass + + # Avg time per task + avg_time_str = "N/A" + if num_tasks > 0 and metadata.get("first_timestamp") and metadata.get("last_timestamp"): + try: + fmt = "%Y-%m-%d %H:%M:%S" + start = datetime.strptime(metadata["first_timestamp"], fmt) + end = datetime.strptime(metadata["last_timestamp"], fmt) + elapsed = end - start + avg_seconds = elapsed.total_seconds() / num_tasks + if avg_seconds >= 60: + avg_time_str = f"{avg_seconds / 60:.1f}m" + else: + avg_time_str = f"{avg_seconds:.0f}s" + except Exception: + pass + + # Generate HTML + html = _generate_pool_viewer_html( + tasks=tasks, + workers=workers, + metadata=metadata, + domain_stats=domain_stats, + num_tasks=num_tasks, + num_success=num_success, + success_rate=success_rate, + elapsed_str=elapsed_str, + avg_time_str=avg_time_str, + ) + + # Write output + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(html) + + return output_path + + +def _generate_pool_viewer_html( + tasks: list[dict], + workers: dict, + metadata: dict, + domain_stats: dict, + num_tasks: int, + num_success: int, + success_rate: float, + elapsed_str: str, + avg_time_str: str, +) -> str: + """Generate HTML content for pool results viewer.""" + + # Worker rows HTML + worker_rows = "" + for worker_id, stats in sorted(workers.items()): + rate = (stats["successes"] / stats["tasks"] * 100) if stats["tasks"] > 0 else 0 + worker_rows += f""" + + Worker {worker_id} + {stats["tasks"]} + {stats["successes"]} + {stats["failures"]} + {rate:.1f}% + + """ + + # Domain breakdown HTML + domain_tags = "" + for domain in sorted(domain_stats.keys()): + stats = domain_stats[domain] + rate = 
(stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0
+        domain_tags += f"""
+        <span class="domain-tag">
+            <span class="domain-name">{domain}</span>
+            <span class="domain-count">{stats["success"]}/{stats["total"]} ({rate:.0f}%)</span>
+        </span>
+        """
+
+    # Task rows HTML
+    task_rows = ""
+    for i, task in enumerate(tasks):
+        status_class = "success" if task.get("success") else "fail"
+        status_text = "PASS" if task.get("success") else "FAIL"
+        result = task.get("result", 0)
+        task_rows += f"""
+        <tr class="task-row" data-domain="{task.get('domain', 'unknown')}" data-status="{status_class}">
+            <td>{task.get('task_id', 'N/A')}</td>
+            <td>{task.get('domain', 'unknown')}</td>
+            <td class="{status_class}">{status_text}</td>
+            <td>{result:.2f}</td>
+            <td>{task.get('steps', 0)}</td>
+            <td>Worker {task.get('worker_id', '?')}</td>
+        </tr>
+        """
+
+    # Domain filter options
+    domain_options = '<option value="all">All</option>'
+    for domain in sorted(domain_stats.keys()):
+        domain_options += f'<option value="{domain}">{domain}</option>'
+
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>WAA Pool Results - {metadata.get("run_name", "Unknown")}</title>
+</head>
+<body>
+    <div class="header">
+        <h1>WAA Pool Results</h1>
+        <div class="meta">
+            Run: {metadata.get("run_name", "Unknown")} |
+            Model: {metadata.get("model", "N/A")} |
+            Workers: {metadata.get("num_workers", len(workers))} |
+            Time: {elapsed_str}
+        </div>
+    </div>
+
+    <div class="section">
+        <h2>Summary</h2>
+        <div class="stats">
+            <div class="stat">
+                <div class="stat-value">{num_tasks}</div>
+                <div class="stat-label">Total Tasks</div>
+            </div>
+            <div class="stat">
+                <div class="stat-value">{num_success}</div>
+                <div class="stat-label">Passed</div>
+            </div>
+            <div class="stat">
+                <div class="stat-value">{num_tasks - num_success}</div>
+                <div class="stat-label">Failed</div>
+            </div>
+            <div class="stat">
+                <div class="stat-value">{success_rate:.1f}%</div>
+                <div class="stat-label">Success Rate</div>
+            </div>
+            <div class="stat">
+                <div class="stat-value">{avg_time_str}</div>
+                <div class="stat-label">Avg Time/Task</div>
+            </div>
+        </div>
+        <div class="domains">
+            {domain_tags}
+        </div>
+    </div>
+
+    <div class="section">
+        <h2>Per-Worker Breakdown</h2>
+        <table>
+            <thead>
+                <tr><th>Worker</th><th>Tasks</th><th>Passed</th><th>Failed</th><th>Success Rate</th></tr>
+            </thead>
+            <tbody>
+                {worker_rows}
+            </tbody>
+        </table>
+    </div>
+
+    <div class="section">
+        <h2>Task Results</h2>
+        <div class="filters">
+            <label>Domain:
+                <select id="domain-filter">{domain_options}</select>
+            </label>
+            <label>Status:
+                <select id="status-filter">
+                    <option value="all">All</option>
+                    <option value="success">Pass</option>
+                    <option value="fail">Fail</option>
+                </select>
+            </label>
+            <span class="task-count">{num_tasks} tasks</span>
+        </div>
+        <table>
+            <thead>
+                <tr><th>Task ID</th><th>Domain</th><th>Status</th><th>Result</th><th>Steps</th><th>Worker</th></tr>
+            </thead>
+            <tbody>
+                {task_rows}
+            </tbody>
+        </table>
+    </div>
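+    <!-- Minimal sketch of the domain/status filter behavior. It assumes the
+         domain-filter and status-filter selects and the data-domain/data-status
+         attributes on .task-row rows defined in the markup above; these names are
+         assumptions introduced here, not confirmed by the original module. -->
+    <script>
+        const domainSel = document.getElementById('domain-filter');
+        const statusSel = document.getElementById('status-filter');
+        // Show a row only when it matches both the selected domain and status.
+        const applyFilters = () =>
+            document.querySelectorAll('.task-row').forEach(row =>
+                row.style.display =
+                    (domainSel.value === 'all' || row.dataset.domain === domainSel.value) &&
+                    (statusSel.value === 'all' || row.dataset.status === statusSel.value)
+                        ? '' : 'none');
+        domainSel.addEventListener('change', applyFilters);
+        statusSel.addEventListener('change', applyFilters);
+    </script>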
+ + + + +""" + + return html From 8e34e814012ea01b91d9595d9d79cb33f244ebb8 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 5 Feb 2026 12:49:00 -0500 Subject: [PATCH 2/6] docs(claude): document VM auto-shutdown and orphan prevention Add documentation for the auto-shutdown feature: - Explain auto-shutdown policy (default 4 hours) - Document --auto-shutdown-hours flag for pool-create and create - Document -y flag for pool-cleanup (skip confirmation) - Document test VM cleanup via try/finally Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 1607 ++++++++++++----------------------------------------- 1 file changed, 367 insertions(+), 1240 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index c8c22fe..481d264 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,343 +4,124 @@ **Philosophy**: "Less is more. 80/20 impact/complexity. Working code beats elegant design." -**Before writing code, ask**: -1. Can this be <100 lines? (ideally <50) -2. Does this provide 80% of value? -3. Is this the simplest approach? +**Before writing code**: Can this be <100 lines? Does this provide 80% of value? Is this the simplest approach? -**Red flags to avoid**: -- Classes when functions work -- Abstractions before 3rd use -- Design docs for non-existent code -- Multiple implementations of same thing +**Avoid**: Classes when functions work, abstractions before 3rd use, design docs for non-existent code. -**See**: `/Users/abrichr/oa/src/openadapt-evals/SIMPLICITY_PRINCIPLES.md` for full guidelines. +See: `/Users/abrichr/oa/src/openadapt-evals/SIMPLICITY_PRINCIPLES.md` for full guidelines. --- -## 🚨🚨🚨 CRITICAL: CLI-FIRST, NEVER RAW COMMANDS 🚨🚨🚨 +## CRITICAL RULES -### THIS IS THE #1 RULE. VIOLATIONS FRUSTRATE THE USER. +### 0. CHECK RESOURCES ON SESSION START -**NEVER run commands that require user permission. ALWAYS use or extend the CLI.** +**After context compaction or session start, check for running Azure resources:** -āŒ **BANNED** (these require permission, waste user's time): ```bash -# Raw Azure CLI -az vm start --name ... -az vm run-command invoke ... +uv run python -m openadapt_ml.benchmarks.cli resources +``` -# Raw SSH -ssh azureuser@IP "command" +This prevents: +- Forgetting about running VMs (costs ~$0.19-0.38/hr) +- Creating duplicate resources +- Losing track of what's deployed -# Raw Python one-liners -uv run python -c "import subprocess; ..." +See `RESOURCES.md` for current status (auto-updated by the command). -# Any command not in the pre-approved CLI -``` +### 1. CLI-FIRST, NEVER RAW COMMANDS + +**NEVER run raw commands. ALWAYS use or extend the CLI.** -āœ… **REQUIRED** (these are pre-approved, don't ask permission): ```bash -# ALL VM operations go through the CLI +# BANNED (require user permission, waste time) +ssh azureuser@IP "anything" +az vm start --name ... +az vm run-command invoke ... +uv run python -c "import subprocess; ..." + +# REQUIRED (pre-approved, don't ask permission) uv run python -m openadapt_ml.benchmarks.cli vm start uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd "command" uv run python -m openadapt_ml.benchmarks.cli vm diag uv run python -m openadapt_ml.benchmarks.cli vm logs ``` -### When Functionality Is Missing - -**If a CLI command doesn't exist for what you need:** -1. **EDIT the CLI** to add the new command/action -2. **THEN call the CLI** command you just added -3. **NEVER use raw commands** as a workaround - -**Example**: Need to restart Docker services? -```python -# 1. 
Add to cli.py under cmd_vm(): -elif action == "fix-docker": - # Restart containerd and docker - commands = [ - "sudo systemctl restart containerd", - "sudo systemctl restart docker", - "docker ps" - ] - for cmd in commands: - run_on_vm(cmd) - -# 2. Then call it: -uv run python -m openadapt_ml.benchmarks.cli vm fix-docker -``` - -**This rule exists because:** -- Raw commands require user approval every time -- CLI commands are pre-approved and don't interrupt workflow -- CLI commands are documented and reusable -- The user has told you this MANY times - LISTEN - ---- - -## šŸ”„ STANDARD WORKFLOW: VM Configuration Changes - -**When VM config needs to change (disk size, VM size, etc.):** - -1. **Delete the current VM** (if running): - ```bash - uv run python -m openadapt_ml.benchmarks.cli vm delete -y - ``` - -2. **Update the code** that launches the VM (e.g., `cli.py` defaults) - -3. **Launch new VM** with the updated code: - ```bash - uv run python -m openadapt_ml.benchmarks.cli vm setup-waa # API key loaded from .env - ``` - -**DO NOT** try to resize/modify running VMs. It's simpler and faster to delete + recreate. - -**Current VM defaults** (in `cli.py`): -- Size: `Standard_D8ds_v5` (300GB temp storage on /mnt) -- Location: `eastus` -- OS: Ubuntu 22.04 LTS - ---- - -## Project Status & Priorities - -**IMPORTANT**: Before starting work, always check the project-wide status document: -- **Location**: `/Users/abrichr/oa/src/STATUS.md` -- **Purpose**: Tracks P0 priorities, active background tasks, blockers, and strategic decisions -- **Action**: Read this file at the start of every session to understand current priorities - -This ensures continuity between Claude Code sessions and context compactions. - ---- - -This file helps maintain context across sessions. - ---- -## āš ļøāš ļøāš ļø MANDATORY: START DASHBOARD FIRST āš ļøāš ļøāš ļø - -### STOP. READ THIS BEFORE DOING ANYTHING. - -**If ANY of these are true, you MUST run the dashboard command IMMEDIATELY:** -- Session just started or was compacted -- User mentions VMs, Azure, WAA, benchmark, or Windows -- You're about to run ANY `vm` subcommand (probe, diag, logs, run-waa, etc.) 
-- You want to check benchmark status - -**THE COMMAND (run this FIRST, not after other commands):** -```bash -uv run python -m openadapt_ml.benchmarks.cli vm monitor -``` - -**ENHANCED FEATURES (as of Jan 2026):** -The `vm monitor` command now provides comprehensive VM usage visibility: -- **VM Status**: Real-time VM state, size, and IP -- **Activity Detection**: What the VM is currently doing (idle, benchmark running, setup) -- **Cost Tracking**: Current uptime, hourly rate, and total cost for session -- **Azure ML Jobs**: Recent jobs from last 7 days with status -- **Evaluation History**: Past benchmark runs and success rates (with --details flag) -- **Dashboard & Tunnels**: Auto-starts web dashboard and SSH/VNC tunnels - -**Usage:** -```bash -# Basic monitoring -uv run python -m openadapt_ml.benchmarks.cli vm monitor - -# With detailed information (costs per day/week, evaluation history) -uv run python -m openadapt_ml.benchmarks.cli vm monitor --details - -# With auto-shutdown after 2 hours -uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2 -``` - -**WHY THIS MATTERS:** -- VNC is ONLY accessible via SSH tunnel at `localhost:8006` (NOT the public IP like `http://20.x.x.x:8006`) -- Azure NSG blocks port 8006 by design - direct access to public IP will NOT work -- The dashboard auto-manages SSH tunnels for VNC access -- Shows real-time costs to prevent budget overruns -- Tracks all Azure ML jobs for visibility into what's running -- Without it, you cannot see what Windows is doing -- The user WILL be frustrated if you keep forgetting this +**If a CLI command doesn't exist**: Edit cli.py to add it, THEN use it. NEVER use raw commands as workaround. -**WRONG (what you keep doing):** -```bash -# DON'T do this - checking probe/diag/logs WITHOUT dashboard running -uv run python -m openadapt_ml.benchmarks.cli vm probe -uv run python -m openadapt_ml.benchmarks.cli vm diag -# Then telling user to "run vm monitor" - NO! YOU run it FIRST! -``` +### 2. START DASHBOARD FIRST FOR VM WORK -**RIGHT (what you should do):** +**Before ANY vm subcommand (probe, diag, logs, etc.):** ```bash -# ALWAYS start dashboard FIRST, then it handles everything uv run python -m openadapt_ml.benchmarks.cli vm monitor ``` -**After every /compact or session restart, your LITERAL FIRST ACTION must be starting this dashboard if VMs are involved.** - ---- -## šŸ”“ MANDATORY: VERIFY URLs BEFORE RECOMMENDING šŸ”“ +This manages: +- SSH tunnels (VNC at localhost:8006, WAA at localhost:5001) +- Real-time cost tracking +- Azure ML job visibility +- Auto-opens web dashboard -**BEFORE telling the user to access ANY URL (localhost:XXXX, VNC, dashboard, etc.):** +**WRONG**: Running `vm probe` then `vm diag` then telling user to run `vm monitor` +**RIGHT**: Run `vm monitor` FIRST - it handles everything -1. **MANUALLY VERIFY** the URL is accessible by running a curl/check command -2. **NEVER assume** a service is running just because it was started earlier -3. **NEVER recommend** a URL based on documentation alone - ALWAYS test first +### 3. 
VERIFY URLs BEFORE RECOMMENDING -**Example verification:** +Always test URLs with curl before telling user to access them: ```bash -# ALWAYS do this BEFORE telling user to visit localhost:8006 -curl -s --connect-timeout 5 http://localhost:8006/ > /dev/null && echo "VNC accessible" || echo "VNC NOT accessible" +curl -s --connect-timeout 5 http://localhost:8006/ > /dev/null && echo "accessible" || echo "NOT accessible" ``` -**If verification fails:** -- Do NOT tell user to access the URL -- Diagnose why it's not working -- Fix it first, THEN provide the URL - -**This rule exists because:** The user was told to access localhost:8006 when the container was gone. This is unacceptable. - ---- -## 🚨🚨🚨 STOP! READ THIS BEFORE EVERY COMMAND 🚨🚨🚨 - -### ABSOLUTELY NEVER USE RAW SSH COMMANDS - -**This is the #1 rule. You have been told this MANY times. STOP IGNORING IT.** - -āŒ **BANNED** (never type these): -- `ssh azureuser@IP "anything"` -- `ssh $SSH_OPTS ...` -- Any command starting with `ssh` to the VM - -āœ… **REQUIRED** (always use these instead): -- `uv run python -m openadapt_ml.benchmarks.cli vm exec --cmd "your command"` -- `uv run python -m openadapt_ml.benchmarks.cli vm diag` -- `uv run python -m openadapt_ml.benchmarks.cli vm logs` - -**If a CLI command doesn't exist, ADD IT TO THE CLI FIRST, then use it.** - -**Before running ANY command involving the VM, ask yourself:** -1. Does this start with `ssh`? → STOP, use CLI instead -2. Is this a raw shell command to the VM? → STOP, use CLI instead -3. Can I use `vm exec --cmd`? → YES, use it - -This has been explained to you repeatedly. FOLLOW IT. - --- -## šŸ”§ DOCKERFILE/VM CHANGES: TEST INSIDE CONTAINER FIRST -**Problem**: Each Dockerfile change triggers: rebuild (10 min) → Windows boot (15 min) → test → repeat. Hours wasted on tiny changes. +## Project Status -**Solution**: Test fixes INSIDE a running container BEFORE rebuilding: - -```bash -# 1. Start a test container with bash entrypoint (seconds) -uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ - 'docker run -d --name test-fix --entrypoint /bin/bash windowsarena/winarena:latest -c "sleep 3600"' - -# 2. Apply your fix manually INSIDE the container (seconds) -uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ - "docker exec test-fix sed -i 's/old/new/' /some/file.sh" - -# 3. Verify the fix works (seconds) -uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ - "docker exec test-fix cat /some/file.sh" - -# 4. Test the actual behavior (seconds) -uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ - "docker exec test-fix /some/script.sh && ls /expected/output" - -# 5. Cleanup -uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd 'docker rm -f test-fix' - -# 6. ONLY AFTER fix is verified: Update Dockerfile and rebuild ONCE -``` - -**Why this matters**: -- Testing a fix takes SECONDS instead of 30+ minutes -- Iterate 10x on the fix before committing to a rebuild -- Don't lose context waiting for long builds -- Each rebuild should be the LAST rebuild, not a guess - ---- +**IMPORTANT**: Check `/Users/abrichr/oa/src/STATUS.md` at session start for P0 priorities. ## Project Overview -openadapt-ml is a model-agnostic, domain-agnostic ML engine for GUI automation agents. It provides: -- Schemas for GUI interaction trajectories -- Synthetic UI generation for bootstrapping +openadapt-ml: Model-agnostic ML engine for GUI automation agents. 
+- Schemas for GUI trajectories - VLM adapters (Qwen3-VL, Qwen2.5-VL, API backends) - Supervised fine-tuning pipeline - Runtime policy API ## Current Focus: Demo Retrieval -**Validated**: Demo-conditioned prompting improves action accuracy (Dec 2024) +**Validated (Dec 2024)**: Demo-conditioned prompting improves accuracy - Zero-shot: 33% correct first actions - With demo: 100% correct first actions - See `docs/experiments/demo_conditioned_prompting_results.md` -**āœ… VALIDATED (Jan 17, 2026)**: Demo persistence fix is working -- The P0 fix in `openadapt-evals` ensures demo is included at EVERY step, not just step 1 -- Mock test confirms: agent behavior changes from 6.8 avg steps (random) to 3.0 avg steps (focused) -- See `openadapt-evals/CLAUDE.md` for full validation details -- **Next step**: Run full WAA evaluation (154 tasks) to measure episode success improvement - -**Next step**: Build demo retrieval to automatically select relevant demos from a library. +**Validated (Jan 2026)**: Demo persistence fix working in openadapt-evals +- Agent behavior: 6.8 avg steps (random) -> 3.0 avg steps (focused) +- Next: Run full WAA evaluation (154 tasks) -**Key insight**: OpenAdapt's value is **trajectory-conditioned disambiguation of UI affordances**, not "better reasoning". +**Key insight**: OpenAdapt's value is trajectory-conditioned disambiguation of UI affordances. ## Benchmark Integration -**Primary benchmark**: Windows Agent Arena (WAA) +**Primary**: Windows Agent Arena (WAA) - 154 tasks across 11 Windows domains -- MIT licensed, can run locally or on Azure +- MIT licensed, runs locally or on Azure - SOTA: ~19.5% success (GPT-5.1 + OmniParser) -**Future benchmarks** (not yet implemented): -- WebArena/VisualWebArena (browser) -- OSWorld (cross-platform desktop) +**Future benchmarks** (not yet implemented): WebArena, OSWorld ---- - -## šŸŽÆ WAA BENCHMARK WORKFLOW (COMPLETE GUIDE) +**Code location**: Benchmark code moved to `openadapt-evals` package. openadapt-ml handles VM management only. 
-### Architecture Overview +```python +# NEW (preferred) +from openadapt_evals import ApiAgent, WAAMockAdapter, evaluate_agent_on_benchmark -``` -ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” -│ LOCAL MACHINE │ -│ │ -│ openadapt-ml CLI openadapt-evals CLI │ -│ (VM management) (benchmark execution) │ -│ │ │ │ -│ │ vm monitor │ live --server localhost:5001 │ -│ │ vm setup-waa │ run (shortcut) │ -│ │ vm diag │ │ -│ ā–¼ ā–¼ │ -│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ -│ │ SSH TUNNELS (auto-managed) │ │ -│ │ localhost:5001 ──────► VM:5000 (WAA Flask API) │ │ -│ │ localhost:8006 ──────► VM:8006 (noVNC) │ │ -│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ -ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ - │ - │ SSH (port 22) - ā–¼ -ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” -│ AZURE VM (Ubuntu) │ -│ │ -│ Docker │ -│ └── windowsarena/winarena:latest │ -│ └── QEMU (Windows 11 Enterprise) │ -│ ā”œā”€ā”€ WAA Flask server (port 5000) │ -│ └── Navi agent (executes tasks) │ -ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +# Backward compat +from openadapt_ml.benchmarks import APIBenchmarkAgent, WAAMockAdapter ``` +--- + +## WAA Workflow + ### Two CLIs, Two Purposes | CLI | Repo | Purpose | @@ -350,1139 +131,485 @@ openadapt-ml is a model-agnostic, domain-agnostic ML engine for GUI automation a ### API Keys -**API keys are auto-loaded from `.env` via `config.py`**. No need to pass explicitly. +Auto-loaded from `.env` via `config.py`. No need to pass explicitly. ```bash -# .env file (create in repo root, not committed to git) +# .env file (not committed to git) OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... ``` -Optional override: `[--api-key KEY]` on any command that needs it. +### Complete Workflow (Pool - Recommended) -### Complete Workflow (Step by Step) - -**Step 1: Setup Azure VM with WAA (first time, ~15 min)** +**Step 1: Create VM Pool (~10 min)** ```bash -cd /Users/abrichr/oa/src/openadapt-ml -uv run python -m openadapt_ml.benchmarks.cli vm setup-waa +# Single VM for quick tests +uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 1 + +# Multiple VMs for parallel evaluation +uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 3 ``` -This creates VM, installs Docker, pulls Windows image, starts WAA server. 
-**Step 2: Start Dashboard and Tunnels** +**Step 2: Wait for WAA Ready (~5-15 min)** ```bash -uv run python -m openadapt_ml.benchmarks.cli vm monitor +uv run python -m openadapt_ml.benchmarks.cli pool-wait ``` -This auto-manages SSH tunnels: -- `localhost:5001` -> VM:5000 (WAA API) -- `localhost:8006` -> VM:8006 (VNC) -**Step 3: Run Benchmark (from openadapt-evals)** +**Step 3: Run Benchmark** ```bash -cd /Users/abrichr/oa/src/openadapt-evals +# Run 3 tasks for quick validation +uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 3 -# Quick smoke test (no API key needed) -uv run python -m openadapt_evals.benchmarks.cli run --agent noop --task notepad_1 - -# Run with OpenAI (uses OPENAI_API_KEY from .env) -uv run python -m openadapt_evals.benchmarks.cli run --agent api-openai --task notepad_1 +# Run all 154 tasks +uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154 +``` -# Run with Claude (uses ANTHROPIC_API_KEY from .env) -uv run python -m openadapt_evals.benchmarks.cli run --agent api-claude --task notepad_1 +**Step 4: View Progress and VNC** +```bash +# Check status +uv run python -m openadapt_ml.benchmarks.cli pool-status -# Override API key if needed -uv run python -m openadapt_evals.benchmarks.cli run --agent api-openai --task notepad_1 --api-key sk-... +# Open VNC to view Windows desktops +uv run python -m openadapt_ml.benchmarks.cli pool-vnc -# Multiple tasks -uv run python -m openadapt_evals.benchmarks.cli run --agent api-openai --tasks notepad_1,notepad_2,browser_1 +# Stream logs +uv run python -m openadapt_ml.benchmarks.cli pool-logs ``` -**Step 4: View Results** +**Step 5: Cleanup (Stop Billing)** ```bash -uv run python -m openadapt_evals.benchmarks.cli view --run-name live_eval +uv run python -m openadapt_ml.benchmarks.cli pool-cleanup ``` -**Step 5: Deallocate VM (stops billing)** +### CLI Commands Reference + ```bash -cd /Users/abrichr/oa/src/openadapt-ml -uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y +# === POOL COMMANDS (Parallel VMs - Recommended) === +pool-create --workers N # Create N VMs with Docker + WAA image +pool-create --workers N --auto-shutdown-hours 6 # Custom auto-shutdown (default: 4h) +pool-wait # Wait for WAA server ready on all workers +pool-run --tasks N # Run N tasks distributed across workers +pool-status # Show status of all pool VMs +pool-vnc # Open VNC to pool workers (SSH tunnels) +pool-logs # Stream logs from all workers +pool-exec --cmd '' # Execute command on all workers +pool-cleanup -y # Delete all pool VMs and resources (no prompt) + +# === SINGLE VM COMMANDS === +create --fast # Create single VM (D8ds_v5) +create --fast --auto-shutdown-hours 6 # Custom auto-shutdown (default: 4h) +delete # Delete VM and all resources +status # Show VM status +start # Start WAA container +stop # Stop WAA container +probe # Check if WAA server is ready +run --num-tasks N # Run benchmark on single VM +vm-start # Start a deallocated VM +deallocate # Stop VM (preserves disk, stops billing) +logs # Show WAA logs +vnc # Open VNC (SSH tunnel) +exec --cmd '' # Run command in container +docker-exec --cmd '' # Run command on VM host + +# === AZURE ML COMMANDS (Legacy) === +run-azure-ml --workers N # Run on Azure ML compute instances +azure-ml-quota # Check quota status +azure-ml-quota-wait # Wait for quota approval ``` -### Quick Reference Commands +### Quota Auto-Detection + +Wait for quota approval before running evaluation: -**From openadapt-ml (VM management):** ```bash -vm monitor # Start dashboard, tunnels, show status -vm 
setup-waa # First-time VM + WAA setup -vm diag # Check disk, Docker, containers -vm probe # Check WAA server status -vm logs # View container logs -vm deallocate # Stop VM billing -vm delete # Remove VM entirely +# Wait for quota (polls every 60 seconds, 24h timeout) +uv run python -m openadapt_ml.benchmarks.cli azure-ml-quota-wait + +# Wait and automatically run evaluation when quota is approved +uv run python -m openadapt_ml.benchmarks.cli azure-ml-quota-wait --auto-run --tasks 20 + +# Custom target (e.g., 16 vCPUs for 2 parallel workers) +uv run python -m openadapt_ml.benchmarks.cli azure-ml-quota-wait --target 16 + +# Run in background (survives terminal close) +nohup uv run python -m openadapt_ml.benchmarks.cli azure-ml-quota-wait --auto-run & ``` -**From openadapt-evals (benchmarks):** +See `docs/QUOTA_AUTO_DETECTION_DESIGN.md` for full documentation. + +### VM Auto-Shutdown and Orphan Prevention + +**Auto-shutdown policy**: All VMs are automatically configured with an Azure auto-shutdown policy as a safety net to prevent orphaned VMs from running indefinitely and consuming quota/money. + +- **Default**: 4 hours after VM creation +- **Customizable**: `--auto-shutdown-hours N` (0 to disable) +- **Azure-level enforcement**: Even if SSH connection drops, the VM will still be deallocated + ```bash -run # Simplified live evaluation (uses localhost:5001) -live # Full control over server URL -mock # Mock evaluation (no VM needed) -probe # Check if WAA server is ready -view # Generate HTML results viewer +# Default: auto-shutdown in 4 hours +uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 3 + +# Custom: auto-shutdown in 8 hours for long-running evaluations +uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 3 --auto-shutdown-hours 8 + +# Disable auto-shutdown (not recommended) +uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 3 --auto-shutdown-hours 0 ``` -### Key Points to Remember +**Test VM cleanup**: During `pool-create`, a test VM is created to check quota availability. This test VM is always cleaned up via try/finally, even if the command is interrupted or fails. -1. **SSH tunnels are required** - Azure NSG blocks direct access to ports 5000/8006 -2. **WAA server runs INSIDE Windows** - The Flask server (port 5000) runs in Windows, not on the Ubuntu host -3. **Default tunnel port is 5001** - Use `--server http://localhost:5001` (not 5000) -4. **Monitor auto-manages tunnels** - Running `vm monitor` sets up everything -5. **Results saved to benchmark_results/** - View with `view --run-name ` +**Manual cleanup**: Use `pool-cleanup -y` to clean up orphaned resources without confirmation prompts (useful for automation): +```bash +uv run python -m openadapt_ml.benchmarks.cli pool-cleanup -y +``` -### Troubleshooting +### Azure ML Automated Workflow + +For parallel benchmark execution on Azure ML compute instances: -**Problem: "Cannot connect to WAA server"** ```bash -# 1. Is VM running? -uv run python -m openadapt_ml.benchmarks.cli vm status +# Single command handles everything: +# 1. Create/start VM if needed +# 2. Start Windows container with VERSION=11e +# 3. Wait for WAA server ready (~15-20 min first time) +# 4. Upload golden image to blob storage +# 5. Run Azure ML benchmark with N workers -# 2. Are tunnels active? -uv run python -m openadapt_ml.benchmarks.cli vm monitor +uv run python -m openadapt_ml.benchmarks.cli run-azure-ml-auto --workers 4 -# 3. 
Check container -uv run python -m openadapt_ml.benchmarks.cli vm diag +# Setup only (golden image, no benchmark) +uv run python -m openadapt_ml.benchmarks.cli run-azure-ml-auto --skip-benchmark + +# Cleanup when done (IMPORTANT - stops billing!) +uv run python -m openadapt_ml.benchmarks.cli run-azure-ml --teardown --confirm ``` -**Problem: "Connection refused on localhost:5001"** -```bash -# Start tunnels via monitor -uv run python -m openadapt_ml.benchmarks.cli vm monitor +See `docs/AZURE_ML_AUTOMATED_WORKFLOW.md` for full documentation. + +### Architecture + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ LOCAL MACHINE │ +│ openadapt-ml CLI openadapt-evals CLI │ +│ (VM management) (benchmark execution) │ +│ │ │ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ SSH TUNNELS (auto-managed by monitor) │ │ +│ │ localhost:5001 → VM:5000 (WAA API) │ │ +│ │ localhost:8006 → VM:8006 (noVNC) │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ SSH (port 22) + ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ AZURE VM (Ubuntu) │ +│ Docker │ +│ └── windowsarena/winarena:latest (Microsoft official) │ +│ └── QEMU (Windows 11 Enterprise) │ +│ ā”œā”€ā”€ WAA Flask server (port 5000) │ +│ └── Navi agent (executes tasks) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ``` -**Problem: "Windows not booting"** +**Key Points**: +1. SSH tunnels required - Azure NSG blocks direct port access +2. WAA server runs INSIDE Windows, not on Ubuntu host +3. Default tunnel port is 5001 (not 5000) +4. Uses vanilla Microsoft WAA image, no custom Dockerfile +5. `VERSION=11e` auto-downloads Windows 11 Enterprise Evaluation + +--- + +## VM Configuration Changes + +Delete + recreate (don't try to resize running VMs): ```bash -# Check VNC (opens in browser via monitor) -# Look at container logs -uv run python -m openadapt_ml.benchmarks.cli vm logs +uv run python -m openadapt_ml.benchmarks.cli vm delete -y +# Update cli.py defaults +uv run python -m openadapt_ml.benchmarks.cli vm setup-waa ``` +**Current defaults** (in cli.py): +- Size: `Standard_D8ds_v5` (8 vCPU, 32GB RAM, 300GB temp on /mnt) +- Location: `eastus` +- OS: Ubuntu 22.04 LTS + --- ## Key Architecture Decisions -1. **SoM (Set-of-Marks) mode** - Achieves 100% on synthetic benchmarks by using element IDs instead of coordinates (`CLICK([1])` not `CLICK(x=0.42, y=0.31)`) - -2. **Grounding module** - Keep but deprioritize. Useful for deployment on real UIs without SoM overlays. Located in `openadapt_ml/grounding/` - -3. **Schema design** - Actions should carry both coordinates AND element grounding (node_id, role, name, bbox) when available - -4. 
**Lossless preservation** - Always store raw benchmark configs verbatim in `raw_config`, `raw_observation`, `raw_action` fields - -5. **DOM/AX is mandatory in schema, optional at runtime** - Observations must support `accessibility_tree` and `dom_html` fields for evaluator compatibility (WebArena, WorkArena, Mind2Web need DOM for scoring), even if agents choose vision-only - -6. **Cloud-First Development** - While features should work locally for testing, immediately build out cloud compatibility (Azure free tier, Lambda Labs) because: - - Most users won't have 96GB RAM locally for VLM training - - Developer productivity suffers waiting for long training runs - - Training should be as short as possible with feedback as quickly as possible - - **Everything should feel fast** - offload heavy compute to cloud GPUs - - Cloud providers: Azure (primary, free tier available), Lambda Labs (GPU rental) - - See `docs/live_inference_design.md` for async inference architecture - -7. **Schema Purity** - The schema must remain domain-agnostic and generic: - - **External systems adapt TO the schema**, not the other way around - - Never add fields to accommodate specific external data structures - - Data transformation belongs in importers/exporters, not core schema - - Use `raw` and `metadata` dict fields for integration-specific data - - If a proposed field feels specific to one use case, it doesn't belong in the schema - - This is a standard open-source library: users import and call functions, they don't shape the API - - See `openadapt_ml/schemas/` for canonical definitions - -8. **Stub Training Adapter (HIGH PRIORITY)** - Always implement stub/mock providers first: - - **Never wait on real training to test UI/code changes** - - Use `--stub` flag to simulate training progress without GPU - - Generates fake loss curves, evaluations, checkpoints in seconds - - Enables rapid iteration on dashboard, viewer, stop button, etc. - - See `docs/stub_training_adapter.md` for implementation details - - Usage: `uv run python -m openadapt_ml.cloud.lambda_labs monitor --stub --open` - -## Expert Feedback - -1. **Prompting first** - Establish baselines with off-the-shelf models before fine-tuning -2. **Prompt engineering matters** - Use structured format: Observation summary → Planning → Possible actions → Action -3. **Element-based actions** - `Click [8]` instead of coordinates, similar to SoM -4. **Larger base models** - They used Gemma3 27B; current 2B/8B might be too small - -## Benchmark Integration (MIGRATED TO openadapt-evals) - -> **IMPORTANT**: Benchmark code has been consolidated into the `openadapt-evals` package. -> The `openadapt_ml/benchmarks/` directory now contains deprecation stubs that re-export from `openadapt-evals`. 
-> -> **Use the new package:** -> ```python -> # NEW (preferred) -> from openadapt_evals import ApiAgent, WAAMockAdapter, evaluate_agent_on_benchmark -> -> # Also works (backward compat) -> from openadapt_ml.benchmarks import APIBenchmarkAgent, WAAMockAdapter -> ``` -> -> **CLI (now in openadapt-evals):** -> ```bash -> # NEW (preferred) -> uv run python -m openadapt_evals.benchmarks.cli mock --tasks 10 -> uv run python -m openadapt_evals.benchmarks.cli live --agent api-claude --server http://vm:5000 -> -> # openadapt-ml CLI still works for VM management -> uv run python -m openadapt_ml.benchmarks.cli vm monitor -> ``` - -The benchmark integration module is now in `openadapt-evals`: -- `openadapt_evals/adapters/` - BenchmarkAdapter, WAAAdapter, WAALiveAdapter -- `openadapt_evals/agents/` - BenchmarkAgent, ApiAgent (with P0 demo persistence fix), PolicyAgent -- `openadapt_evals/benchmarks/` - runner, metrics, viewer, data_collection - -### APIBenchmarkAgent - -The `APIBenchmarkAgent` wraps hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation baselines. -This enables comparing fine-tuned models against off-the-shelf VLMs. +1. **SoM mode** - Element IDs (`CLICK([1])`) instead of coordinates for 100% accuracy on synthetic benchmarks -```python -from openadapt_ml.benchmarks import APIBenchmarkAgent, evaluate_agent_on_benchmark +2. **Grounding module** - Keep but deprioritize. Useful for real UIs without SoM overlays. Located in `openadapt_ml/grounding/` -# Claude baseline -agent = APIBenchmarkAgent(provider="anthropic") -results = evaluate_agent_on_benchmark(agent, adapter) +3. **Schema design** - Actions carry both coordinates AND element grounding when available -# GPT-5.1 baseline -agent = APIBenchmarkAgent(provider="openai") -results = evaluate_agent_on_benchmark(agent, adapter) -``` - -CLI usage: -```bash -# Run Claude evaluation on mock tasks -uv run python -m openadapt_ml.benchmarks.cli run-api --provider anthropic --tasks 5 +4. **Lossless preservation** - Store raw benchmark configs in `raw_config`, `raw_observation`, `raw_action` fields -# Run GPT-5.1 evaluation -uv run python -m openadapt_ml.benchmarks.cli run-api --provider openai --tasks 5 +5. **Schema purity** - Domain-agnostic; external systems adapt TO the schema, not vice versa. See `openadapt_ml/schemas/` -# Disable accessibility tree in prompts -uv run python -m openadapt_ml.benchmarks.cli run-api --no-a11y --tasks 5 -``` +6. **Cloud-first** - Offload heavy compute to cloud GPUs (Azure, Lambda Labs). Everything should feel fast. -The agent: -- Converts BenchmarkObservation to API format (screenshot + structured prompt) -- Parses VLM responses into BenchmarkActions using regex patterns -- Supports CLICK(x,y), CLICK([id]), TYPE("text"), KEY(key), SCROLL(dir), DONE() -- Stores raw VLM responses in `action.raw_action` for debugging - -### Azure Automation - -`scripts/setup_azure.py` fully automates Azure setup with 15 steps: -1. Check Azure CLI installation -2. Login to Azure -3. Select subscription -4. Register resource providers (Compute, ML, Storage, ContainerRegistry) -5. Create resource group -6. Create service principal with Contributor role -7. Create ML workspace -8. Create Azure Container Registry (ACR) -9. Import WAA Docker image from Docker Hub to ACR -10. Attach ACR to ML workspace -11. Grant AcrPull role to workspace managed identity -12. Sync workspace keys for ACR authentication -13. Request GPU quota -14. Create storage account -15. 
Create inference queue and blob containers - -The script writes all credentials to `.env` including: -- Service principal credentials (AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID) -- Workspace config (AZURE_SUBSCRIPTION_ID, AZURE_ML_RESOURCE_GROUP, AZURE_ML_WORKSPACE_NAME) -- Docker image path (AZURE_DOCKER_IMAGE) pointing to ACR - -**Why ACR?** Azure ML cannot pull from Docker Hub or ghcr.io directly. The image must be in ACR. - -**ACR Authentication**: The script automatically configures ACR authentication by granting the workspace's managed identity AcrPull role on the ACR. This ensures compute instances can pull Docker images without requiring admin credentials. - -CLI usage: -```bash -# Set up Azure (creates resources, ACR, imports image, writes credentials to .env) -python scripts/setup_azure.py +7. **Stub training** - Use `--stub` flag for rapid UI iteration without GPU -# Clean up all Azure resources -python scripts/setup_azure.py --cleanup +8. **DOM/AX mandatory in schema** - For evaluator compatibility (WebArena, Mind2Web need DOM), even if agents use vision-only -# Estimate Azure costs -python -m openadapt_ml.benchmarks.cli estimate --workers 40 +--- -# Test with mock adapter (no Windows required) -python -m openadapt_ml.benchmarks.cli test-mock --tasks 20 +## Azure Automation -# Check Azure status -python -m openadapt_ml.benchmarks.cli status +`scripts/setup_azure.py` automates 15-step Azure setup: +- Creates resource group, service principal, ML workspace, ACR +- Imports WAA Docker image to ACR +- Configures ACR authentication (AcrPull role) +- Writes credentials to `.env` -# Run on Azure (WAA submodule auto-detected) -python -m openadapt_ml.benchmarks.cli run-azure --workers 1 +```bash +python scripts/setup_azure.py # Setup +python scripts/setup_azure.py --cleanup # Cleanup ``` -Schema extensions completed in `openadapt_ml/schemas/sessions.py`: -- `Action`: `target_node_id`, `target_role`, `target_name`, `answer`, `key`, `modifiers`, `scroll_direction`, `scroll_amount`, `end_x`, `end_y` -- `Observation`: `accessibility_tree`, `dom_html`, `url`, `window_title`, `app_name`, `focused_element` +--- ## Cloud GPU Training See `docs/cloud_gpu_training.md` for full documentation. -**Quick start:** ```bash -# Lambda Labs - fully automated training pipeline -uv run python -m openadapt_ml.cloud.lambda_labs train \ - --capture /path/to/capture \ - --goal "Task description" +# Lambda Labs - automated pipeline +uv run python -m openadapt_ml.cloud.lambda_labs train --capture /path --goal "Task" -# Or step by step: +# Step by step uv run python -m openadapt_ml.cloud.lambda_labs launch --type gpu_1x_a10 uv run python -m openadapt_ml.cloud.lambda_labs train-status uv run python -m openadapt_ml.cloud.lambda_labs terminate ``` -**Important**: All cloud operations should be wrapped in CLI commands, not raw SSH. 
The Lambda Labs module provides: -- `LambdaLabsClient.setup_instance()` - Clone repo, install deps -- `LambdaLabsClient.upload_capture()` - rsync capture data -- `LambdaLabsClient.run_training()` - Execute training -- `LambdaLabsClient.get_training_status()` - Poll training progress +--- -## Training & Visualization Commands +## Training Commands ```bash -# Train on a capture recording +# Train on capture uv run python -m openadapt_ml.scripts.train \ --config configs/qwen3vl_capture.yaml \ --capture /path/to/capture \ - --open # opens dashboard in browser + --open -# Serve dashboard/viewer via HTTP (RECOMMENDED) -# Auto-regenerates dashboard.html and viewer.html before serving +# Serve dashboard (auto-regenerates HTML) uv run python -m openadapt_ml.cloud.local serve --port 8080 --open -# Skip regeneration if files are already up to date -uv run python -m openadapt_ml.cloud.local serve --port 8080 --open --no-regenerate - -# Regenerate viewer/dashboard without serving -# Useful after training completes or to refresh with latest code changes +# Regenerate viewer without serving uv run python -m openadapt_ml.cloud.local viewer -# Compare human vs model predictions +# Compare human vs model uv run python -m openadapt_ml.scripts.compare \ --capture /path/to/capture \ --checkpoint checkpoints/model \ --open ``` -## Benchmark Data Collection & Testing - -```bash -# Test benchmark data collection (Phase 1) -# Creates directory structure with screenshots, execution traces, and metadata -uv run python -m openadapt_ml.benchmarks.cli test-collection --tasks 5 - -# Custom run name and output directory -uv run python -m openadapt_ml.benchmarks.cli test-collection \ - --tasks 10 \ - --run-name my_test_run \ - --output benchmark_results \ - --model-id "my-agent-v1" - -# Run the standalone test script (equivalent to test-collection) -uv run python test_data_collection.py -``` - -**Output directory structure:** -``` -benchmark_results/ -ā”œā”€ā”€ {run_name}/ -│ ā”œā”€ā”€ metadata.json # Benchmark name, model ID, timestamp -│ ā”œā”€ā”€ summary.json # Aggregate metrics (success rate, avg steps) -│ └── tasks/ -│ ā”œā”€ā”€ task_001/ -│ │ ā”œā”€ā”€ task.json # Task definition -│ │ ā”œā”€ā”€ execution.json # Execution trace with steps -│ │ └── screenshots/ -│ │ ā”œā”€ā”€ step_000.png -│ │ ā”œā”€ā”€ step_001.png -│ │ └── ... -│ └── task_002/ -│ └── ... -``` - -**Key files:** -- `execution.json`: Contains step-by-step trace with actions, reasoning, timestamps -- `task.json`: Task definition with instruction, domain, time limits -- `summary.json`: High-level metrics suitable for benchmark viewer -- `screenshots/`: PNG screenshots at each step - -## Viewer Setup Troubleshooting - -**Problem**: Viewer shows "No model loaded" after training. - -**Root cause**: The viewer requires: -1. A base `comparison.html` file (from capture or generated during training) -2. 
Prediction JSON files (`predictions_*.json`) - -**Solution**: -```bash -# If comparison.html is missing, copy from the capture directory: -cp /path/to/capture/comparison.html training_output/ - -# Then regenerate the viewer: -uv run python -m openadapt_ml.cloud.local viewer - -# Serve and open: -uv run python -m openadapt_ml.cloud.local serve --open -``` - -**Key files in training_output/**: -- `training_log.json` - Training progress, loss curves, evaluations -- `dashboard.html` - Training dashboard (auto-regenerated by serve command) -- `viewer.html` - Capture viewer with predictions (auto-regenerated by serve command) -- `comparison.html` - Base viewer from capture (needed for viewer generation) -- `predictions_*.json` - Model predictions by checkpoint (e.g., `predictions_epoch3.json`) - -## Files to Know - -- `docs/cloud_gpu_training.md` - Lambda Labs and Azure GPU training guide -- `docs/benchmark_integration_plan.md` - Benchmark integration architecture -- `docs/azure_waa_setup.md` - Azure WAA setup guide (quota increase, costs, troubleshooting) -- `docs/design.md` - Overall system design -- `docs/experiments/demo_conditioned_prompting_results.md` - Demo experiment results (validated Dec 2024) -- `openadapt_ml/cloud/` - Cloud GPU providers (Lambda Labs, Azure) -- `openadapt_ml/benchmarks/` - Benchmark integration module (WAA, base classes) -- `openadapt_ml/experiments/demo_prompt/` - Demo-conditioned prompting experiment -- `openadapt_ml/grounding/` - Grounding module (GeminiGrounder, etc.) -- `openadapt_ml/ingest/capture.py` - Converts openadapt-capture recordings to Episodes -- `scripts/run_demo_experiment.py` - Run demo-conditioned experiment -- `configs/qwen3vl_synthetic_som.yaml` - SoM training config +--- ## Code Patterns ### Environment Variables -Always load env vars through `openadapt_ml/config.py` using pydantic-settings, NOT directly from `os.environ`: - +Use `config.settings`, NOT `os.environ`: ```python # Good from openadapt_ml.config import settings -api_key = settings.lambda_api_key +api_key = settings.openai_api_key # Bad -api_key = os.environ.get("LAMBDA_API_KEY") +api_key = os.environ.get("OPENAI_API_KEY") ``` -This ensures `.env` file is automatically loaded. When adding new env vars: +When adding new env vars: 1. Add to `Settings` class in `config.py` -2. Add to `.env.example` with documentation - -### API Keys for CLI Commands +2. Add to `.env.example` -CLI commands that need API keys (e.g., `waa`, `run-api`) follow this priority: -1. Command-line argument: `--api-key YOUR_KEY` -2. Config file: `settings.openai_api_key` from `.env` -3. Environment variable: `$OPENAI_API_KEY` +### API Keys for CLI +Priority: `--api-key` flag > `.env` file > environment variable -**Best practice**: Store keys in `.env` file (not committed to git): -```bash -# .env -OPENAI_API_KEY=sk-... -ANTHROPIC_API_KEY=sk-ant-... -``` - -Then CLI commands work without `--api-key`: -```bash -# These load API key from .env automatically -uv run python -m openadapt_ml.benchmarks.cli waa -uv run python -m openadapt_ml.benchmarks.cli run-api --provider openai -``` - -## File Access - -The user has pre-approved read access to: -- `~/oa/src/` - Parent directory containing related projects (openadapt-capture, etc.) - -Related paths: -- Capture recordings: `/Users/abrichr/oa/src/openadapt-capture/` -- Screenshots: `/Users/abrichr/oa/src/openadapt-capture//screenshots/` - -## Shared Dashboard Components - -The training dashboard and capture viewer share UI components for visual consistency. 
When modifying dashboard UI: - -**Key files:** -- `openadapt_ml/training/trainer.py` - Contains shared component functions: - - `_get_shared_header_css()` - CSS for the unified header - - `_generate_shared_header_html()` - HTML generator for nav tabs + controls - -**Pattern:** -1. Define shared CSS/HTML in dedicated functions (prefixed with `_`) -2. Both `generate_training_dashboard()` and `_enhance_comparison_to_unified_viewer()` call these functions -3. Changes to shared functions automatically propagate to all dashboards +--- -**Why this matters:** -- Prevents visual inconsistencies when switching between Training and Viewer tabs -- Single source of truth for styling (no duplicate CSS to maintain) -- Easier to add new dashboards that match existing style +## Dockerfile Testing -## CRITICAL: Always Start Dashboard When Running Azure Resources +Test fixes INSIDE container before rebuilding (saves 30+ min): -See the āš ļø MANDATORY section at the TOP of this file. Use: ```bash -uv run python -m openadapt_ml.benchmarks.cli vm monitor -``` - -## āš ļø SAFE PROCESS MANAGEMENT āš ļø +# 1. Start test container +uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ + 'docker run -d --name test-fix --entrypoint /bin/bash windowsarena/winarena:latest -c "sleep 3600"' -**NEVER use broad pkill patterns** - they can kill unrelated applications! +# 2. Apply fix +uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ + "docker exec test-fix sed -i 's/old/new/' /some/file.sh" -**WRONG (DANGEROUS):** -```bash -# These patterns are TOO BROAD and will kill unrelated apps: -pkill -f "openadapt" # Kills anything with "openadapt" in path -pkill -f "python" # Kills ALL Python processes -pkill -9 -f "openadapt_ml" # Killed Claude Code, Windsurf, Signal, Chrome tabs! -``` +# 3. Verify +uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \ + "docker exec test-fix cat /some/file.sh" -**RIGHT (SAFE):** -```bash -# Use specific PID-based killing: -lsof -i :8765 | grep python | awk '{print $2}' | xargs kill 2>/dev/null +# 4. Cleanup +uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd 'docker rm -f test-fix' -# Or use specific process names with full path matching: -pkill -f "python.*-m openadapt_ml.cloud.local serve" +# 5. ONLY rebuild after fix is verified +``` -# Or kill only the specific port listener: -kill $(lsof -t -i :8765) 2>/dev/null +--- -# Check what would be killed FIRST: -pgrep -f "openadapt" -l # Lists matching processes before killing -``` +## Files to Know -**Before any pkill command:** -1. Run `pgrep -f "pattern" -l` to see what matches -2. Verify only intended processes are listed -3. Use the most specific pattern possible -4. Prefer port-based or PID-based killing +- `docs/WAA_APPROACH_REVIEW.md` - Full WAA setup documentation +- `docs/cloud_gpu_training.md` - Lambda Labs/Azure training guide +- `docs/azure_waa_setup.md` - Azure quota, costs, troubleshooting +- `docs/design.md` - System design +- `openadapt_ml/benchmarks/cli.py` - VM CLI commands +- `openadapt_ml/cloud/ssh_tunnel.py` - SSH tunnel manager +- `openadapt_ml/config.py` - Settings (pydantic-settings) +- `openadapt_ml/schemas/` - Canonical schema definitions -## Git Commit Style (Angular Convention) +--- -**ALWAYS use Angular-style commit messages** for all commits across all OpenAdapt repositories. 
+## Git Commit Style (Angular) -**Format:** ``` (): - - Co-Authored-By: Claude Opus 4.5 ``` -**Types:** -- `feat`: New feature -- `fix`: Bug fix -- `docs`: Documentation only -- `style`: Code style (formatting, semicolons, etc.) -- `refactor`: Code change that neither fixes a bug nor adds a feature -- `perf`: Performance improvement -- `test`: Adding or fixing tests -- `chore`: Maintenance tasks (deps, build, etc.) -- `ci`: CI/CD changes - -**Examples:** -```bash -# Feature -git commit -m "feat(viewer): add keyboard shortcuts for navigation" - -# Bug fix -git commit -m "fix(waa): resolve Docker storage path issue" - -# Documentation -git commit -m "docs: remove archived OpenAdapter from repository listing" +**Types**: feat, fix, docs, style, refactor, perf, test, chore, ci -# Refactor -git commit -m "refactor(cli): consolidate VM commands into single subcommand" -``` - -**Subject line rules:** -- Use imperative mood ("add" not "added" or "adds") -- No period at the end -- Max 50 characters -- Lowercase first letter after type +**Rules**: Imperative mood, no period, max 50 chars, lowercase after type --- ## Don't Do +- Don't use `os.environ` - use `config.settings` +- Don't use `pip install` - use `uv add` or `uv sync` +- Don't run VM ops without `vm monitor` first +- Don't use raw SSH/shell commands - use CLI +- Don't tell user to run commands - YOU run them +- Don't use broad pkill patterns (they kill unrelated apps) - Don't add timelines/estimates to plans -- Don't mention specific clients by name in public docs -- Don't over-engineer - keep solutions minimal -- Don't use `os.environ` directly - use `config.settings` instead -- Don't use `pip install` - always use `uv add` for dependencies or `uv sync` for the project -- Don't use non-Angular commit messages -- **Don't run Azure/VM operations without starting the dashboard first** - - āŒ WRONG: `vm probe` then `vm diag` then telling user to run `vm monitor` - - āœ… RIGHT: `vm monitor` FIRST (it does probe, tunnels, everything) - - This is the #1 mistake you keep making. STOP IT. -- **Don't use raw SSH/shell commands** - always use or create CLI commands instead (see below) -- **Don't tell user to run commands** - YOU run them. The CLI exists so YOU can use it. - -## CLI-First Development (IMPORTANT) - -**ALWAYS** use CLI commands instead of raw SSH/shell commands: -- āœ… `uv run python -m openadapt_ml.benchmarks.cli vm diag` (not `ssh ... df -h`) -- āœ… `uv run python -m openadapt_ml.benchmarks.cli vm logs` (not `ssh ... docker logs`) -- āœ… `uv run python -m openadapt_ml.benchmarks.cli vm probe` (not `ssh ... curl`) - -**Why**: CLI commands are documented, tested, and persist across context compactions. Raw commands are forgotten. - -**When you need a new operation**: -1. Add a new action to the relevant CLI subcommand (e.g., `vm logs`, `vm exec`) -2. Document it in CLAUDE.md -3. 
Use the CLI command going forward - -**Available VM CLI commands**: -```bash -vm monitor # THE GO-TO COMMAND: Start dashboard, open browser, show probe status - # Options: --auto-shutdown-hours N (deallocate after N hours) -vm diag # Check disk, Docker, containers, WAA probe status -vm logs # View container logs (--lines N, --follow) -vm probe # Check WAA server status (--wait to poll) -vm exec # Run command in container (--cmd 'your command') -vm host-exec # Run command on VM host (not in container) (--cmd 'your command') -vm start-windows # Start Windows container with vanilla WAA image -vm restart-windows # Stop and restart the Windows container -vm reset-windows # Delete Windows storage and start fresh installation -vm docker-prune # Clean Docker images, containers, build cache (free disk space) -vm docker-move # Move Docker/containerd to /mnt via symlinks (300GB space with D8ds_v5) -vm status # Azure VM status -vm ssh # Interactive SSH -vm deallocate # Stop VM billing (preserves disk), use -y to skip confirmation -vm start # Start a deallocated VM -vm delete # Delete VM (use -y to skip confirmation) - -# Use 'waa' command instead of deprecated 'vm setup-waa' and 'vm run-waa': -waa --setup-only # Full VM setup with Docker and vanilla WAA image -waa --num-tasks N # Run benchmark with N tasks -``` +- Don't mention specific clients by name -## TODO / Known Issues - -### Session-Based Cost/Time Tracking -**Status**: FIXED (Jan 2026) - -**Problem**: Dashboard showed cumulative cost/time from VM creation, not current session. -- User deallocated VM overnight, restarted it today -- Dashboard showed "$8.82 running cost" and "22h 58m elapsed" -- This was lifetime cost, not current session cost - -**Root cause**: Session tracker (`session_tracker.py`) wasn't integrated with CLI commands. -- `vm deallocate` didn't call `pause_session()`, so timer kept running -- `vm start` didn't call `start_session()` to resume properly -- `vm delete` didn't call `end_session()` or `clear_session()` - -**Solution implemented**: - -1. **CLI integration**: Added session tracker calls to VM lifecycle commands - - `vm deallocate`: Calls `pause_session()` and shows session summary - - `vm start`: Calls `start_session()` to resume with accumulated time - - `vm delete`: Calls `end_session()` and `clear_session()` - - Auto-shutdown in monitor: Calls `pause_session()` - - cleanup-stale: Calls `pause_session()` for deallocated VMs - -2. **Dashboard hybrid display**: Shows BOTH session and total costs - - "This Session: $0.14" - current running time since last start - - "Total Cost: $8.82" - accumulated across all sessions - - "Total Elapsed: 23h" - total time VM has been running - -3. 
**API enhancements**: Added fields to status response - - `current_session_seconds`: Time since last resume - - `current_session_cost_usd`: Cost for current session only - - `accumulated_seconds`: Time from previous sessions - -**Files changed**: -- `openadapt_ml/benchmarks/cli.py` - Session tracker calls in VM commands -- `openadapt_ml/cloud/local.py` - API returns session breakdown -- `openadapt_ml/training/azure_ops_viewer.py` - Dashboard shows both session and total - -### PyPI Publishing -**Status**: DONE +--- -Completed by background agent: -- Updated `pyproject.toml` with package metadata (description, authors, classifiers, URLs, license) -- Created `LICENSE` (MIT, matching related projects) -- Created `.github/workflows/publish.yml` for automated PyPI publishing on version tags -- Build system: hatchling +## Safe Process Management -To publish: -1. Set up PyPI trusted publishing (PyPI → Account Settings → Publishing) -2. `git tag v0.1.0 && git push origin v0.1.0` +```bash +# WRONG (kills unrelated apps) +pkill -f "openadapt" +pkill -f "python" -### Azure WAA Evaluation - ACR Auth Issue -**Status**: FIXED - setup_azure.py now configures ACR authentication automatically +# RIGHT (specific) +kill $(lsof -t -i :8765) 2>/dev/null +pkill -f "python.*-m openadapt_ml.cloud.local serve" -**Problem**: Azure ML compute instances cannot pull from ACR even after attaching ACR to workspace. +# Check before killing +pgrep -f "pattern" -l ``` -Failed to pull Docker image openadaptacr.azurecr.io/winarena:latest -``` - -**Root cause**: The workspace's managed identity needed AcrPull role on the ACR, which wasn't being granted automatically. - -**Solution implemented**: -1. Added `grant_acr_pull_role()` function to setup_azure.py that: - - Gets workspace managed identity principal ID - - Assigns AcrPull role on ACR to that identity -2. Added `sync_workspace_keys()` to refresh workspace credentials -3. Updated setup flow from 12 steps to 15 steps: - - Step 10: Attach ACR to workspace - - Step 11: Grant AcrPull role to workspace managed identity - - Step 12: Sync workspace keys - -**Related files**: -- `scripts/setup_azure.py` - Azure setup automation (includes ACR auth) -- `openadapt_ml/benchmarks/azure.py` - Azure orchestration -- `.env` - AZURE_DOCKER_IMAGE setting -**References**: -- [Azure ML Managed Identity ACR Authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-identity-based-service-authentication) -- [ACR Pull Role Assignment](https://learn.microsoft.com/en-us/azure/container-registry/container-registry-authentication-managed-identity) - -### Azure WAA Evaluation - Dedicated VM Setup -**Status**: WORKING - Vanilla Microsoft WAA (Jan 2026) +--- -**IMPORTANT**: See `docs/WAA_APPROACH_REVIEW.md` for full documentation. +## File Access -**CRITICAL**: Uses vanilla Microsoft WAA (windowsarena/winarena). No custom Dockerfile. +Pre-approved read access to `~/oa/src/` (related projects like openadapt-capture). -**How it works**: -- Uses official `windowsarena/winarena:latest` Docker image from Microsoft -- Uses `VERSION=11e` env var to auto-download Windows 11 Enterprise Evaluation -- Container runs `entry.sh` which boots Windows and starts WAA server automatically -- First run: Downloads Windows + installs (~15-20 min) -- Subsequent runs: Boots from cached disk image (~2-3 min) +## Current Capture -**FULLY AUTOMATED - Via CLI**: +Path: `/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift` +Task: Turn off Night Shift in macOS System Settings -```bash -# 1. 
Setup Azure VM with Docker and pull vanilla WAA image (~10 min) -uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --setup-only +--- -# 2. Run benchmark -uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --num-tasks 20 +## TODO / Known Issues -# 3. Monitor (optional, for debugging) -uv run python -m openadapt_ml.benchmarks.cli vm monitor -# Opens browser to VNC at http://localhost:8006 +### Benchmark Viewer - Phase 4 +**Status**: TODO -# 4. Delete VM when done (IMPORTANT: stops billing!) -uv run python -m openadapt_ml.benchmarks.cli vm delete -y -``` +Add failure clustering and regression detection. Phases 1-3 done: +- Data collection with ExecutionTraceCollector +- Viewer generation with `view --run-name {name}` +- UI with summary, task list, step replay, playback controls -**Diagnostic commands**: -```bash -uv run python -m openadapt_ml.benchmarks.cli vm diag # Check disk, Docker, containers -uv run python -m openadapt_ml.benchmarks.cli vm status # Azure VM status -uv run python -m openadapt_ml.benchmarks.cli vm ssh # Interactive SSH -uv run python -m openadapt_ml.benchmarks.cli vm probe # Check WAA server readiness -uv run python -m openadapt_ml.benchmarks.cli vm logs # View container logs -``` +### Azure ML Experiment ID +**Status**: TODO -**Screenshot capture** (for PR documentation): -```bash -# List available screenshot targets -uv run python -m openadapt_ml.benchmarks.cli screenshot --list - -# Capture WAA-specific screenshots for PR -uv run python -m openadapt_ml.benchmarks.cli screenshot --waa --pr-mode - -# Capture specific targets -uv run python -m openadapt_ml.benchmarks.cli screenshot --target status --target probe --pr-mode - -# Available targets: -# status - Azure VM status -# probe - WAA probe endpoint status -# diag - VM diagnostic info -# vm-screen - Windows VM screen (via QEMU) -# vnc - VNC viewer (localhost:8006) -# terminal - VM monitor terminal output -# azure-ops - Azure ops dashboard -# training - Training dashboard -``` +Retrieve experiment_id dynamically instead of hardcoded UUID. -**Key requirements**: -1. **VM Size**: `Standard_D8ds_v5` recommended (8 vCPU, 32GB RAM, 300GB temp storage for nested virtualization) -2. **API key**: `config.json` with OPENAI_API_KEY (or set env var) -3. **Valid model**: Use real OpenAI model name (gpt-4o, gpt-4o-mini) +### Azure ML Port 80 Conflict +**Status**: INVESTIGATING -**Architecture**: +Azure ML compute instances have Microsoft infrastructure services on port 80. When vanilla WAA's dockur/windows container starts, nginx tries to bind to port 80 and fails: ``` -Azure VM (Standard_D8ds_v5, nested virt enabled, 300GB /mnt) - └── Docker (data on /mnt) - └── windowsarena/winarena:latest (official Microsoft image) - └── QEMU running Windows 11 (IP: 172.30.0.2) - └── WAA Flask server on port 5000 - └── Navi agent executing tasks +nginx: [emerg] bind() to 0.0.0.0:80 failed (98: Address already in use) ``` -**How vanilla WAA works**: -1. Uses `windowsarena/winarena:latest` from Docker Hub -2. `VERSION=11e` triggers auto-download of Windows 11 Enterprise Evaluation -3. `entry.sh` handles Windows boot and server startup -4. No custom patching or Dockerfile required - -**Monitor progress**: -- VNC: `http://localhost:8006` (via SSH tunnel, auto-managed by dashboard) -- Logs: `uv run python -m openadapt_ml.benchmarks.cli vm logs` - -**Files**: -- `docs/WAA_APPROACH_REVIEW.md` - Full analysis (updated Jan 2026) -- `vendor/WindowsAgentArena/` - Official WAA scripts (run-local.sh, etc.) 
-- `openadapt_ml/benchmarks/cli.py` - CLI commands - -### Docker Disk Space Management -**Status**: FIXED - Automatic cleanup (Jan 2026) - -**Problem**: Docker build cache on /mnt was growing to 90+ GB during builds, exhausting disk space and causing builds to fail with "no space left on device". Note: With Standard_D8ds_v5, /mnt is now 300GB which should be sufficient. - -**Root cause**: Docker's build cache and containerd snapshotter accumulate data that isn't cleaned by `docker system prune`: -- `/mnt/docker/buildkit/containerd-overlayfs` - BuildKit layer cache -- `/mnt/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots` - Containerd snapshots -- These can grow to 30-40 GB each, even with no images present +**Key insight**: Port 80 is just nginx redirecting to noVNC on port 8006. **NOT essential for WAA**. +- Port 5000: WAA Flask API (benchmark execution) - ESSENTIAL +- Port 8006: noVNC (browser VNC) - ESSENTIAL +- Port 80: nginx redirect - NOT ESSENTIAL -**Solution implemented** (3 parts): +**What we're testing**: +1. `WEB=N` env var to disable nginx entirely +2. SSH tunnel to access ports 8006 and 5000 for debugging +3. Enhanced diagnostics in run_entry.py to verify Windows boots despite nginx failure -1. **Automatic pre-build cleanup**: Before Docker builds, the CLI now runs `docker builder prune -af` and checks available disk space, warning if < 50GB. +**SSH key support added**: Compute instances now use your local SSH key (~/.ssh/id_rsa) for direct SSH access. -2. **Automatic post-build cleanup**: After successful builds, the CLI cleans build cache and dangling images to prevent accumulation. +See `docs/AZURE_ML_PORT_80_FIX.md` for full analysis and options. -3. **BuildKit garbage collection**: New VMs are configured with `/etc/buildkit/buildkitd.toml` that limits cache to 30GB max. +### Azure ML CLI Commands -4. **Enhanced docker-prune command**: Now includes "deep cleanup" that stops Docker/containerd and removes orphaned snapshots that normal prune misses. - -**Usage**: ```bash -# Quick cleanup (standard prune + deep cleanup + configure GC) -uv run python -m openadapt_ml.benchmarks.cli vm docker-prune - -# For severe disk issues, delete VM and recreate (comes with GC pre-configured) -uv run python -m openadapt_ml.benchmarks.cli vm delete -y -uv run python -m openadapt_ml.benchmarks.cli vm setup-waa ``` - -**Files changed**: -- `openadapt_ml/benchmarks/cli.py` - Pre/post build cleanup, enhanced docker-prune -- New VMs get BuildKit GC config during setup - -### Windows "Select Operating System" Prompt Fix -**Status**: N/A with vanilla WAA (Jan 2026) - -**Note**: This issue was specific to the custom waa-auto Dockerfile approach which has been deprecated. - -With vanilla WAA (`windowsarena/winarena:latest`), using `VERSION=11e` automatically selects Windows 11 Enterprise Evaluation which has proper autounattend.xml handling. - -**If you still see the prompt**: -1. Delete cached storage: `uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd 'rm -rf /mnt/waa-storage/*'` -2. Re-run setup: `uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --fresh` - -### SSH Tunnel Management (VNC/WAA Access) -**Status**: DONE +# Status and monitoring +azure-ml-status # Show compute instances and recent jobs +azure-ml-logs --job NAME # Stream logs from running job +azure-ml-monitor # Interactive monitor with VNC tunnel -**Problem**: Azure VMs have Network Security Groups (NSGs) that only expose port 22 (SSH) by default. 
Ports 8006 (VNC) and 5000 (WAA) are not accessible directly. +# Run benchmarks +run-azure-ml-auto --workers N # Fully automated workflow -**Solution**: Automatic SSH tunnel management via `SSHTunnelManager`: +# Cleanup (IMPORTANT - stop billing!) +azure-ml-cancel # Cancel running job (or --job NAME) +azure-ml-delete-compute # Delete compute instance (--name NAME or --all) +azure-ml-cleanup --yes # Cancel all jobs + delete all instances -``` -Browser → localhost:8006 → SSH Tunnel → Azure VM:8006 → Docker → noVNC -Browser → localhost:5001 → SSH Tunnel → Azure VM:5000 → WAA Flask -``` - -**Architecture**: -1. When VM's WAA probe becomes "ready", tunnels auto-start -2. When VM goes offline, tunnels auto-stop -3. Dashboard shows tunnel status next to VNC button -4. VNC button links to localhost:port (tunnel endpoint) - -**Files**: -- `openadapt_ml/cloud/ssh_tunnel.py` - SSHTunnelManager class -- `openadapt_ml/cloud/local.py` - Integration with dashboard server -- `openadapt_ml/training/benchmark_viewer.py` - UI showing tunnel status - -**API Endpoints**: -- `GET /api/tunnels` - Returns tunnel status for VNC and WAA -- `GET /api/vms` - Includes `tunnels` field with per-tunnel status - -**Key features**: -- Auto-start on VM online (idempotent - safe to call repeatedly) -- Auto-stop on VM offline -- Port conflict detection -- Graceful shutdown on process exit -- No manual SSH commands needed - -**Manual usage** (if needed): -```python -from openadapt_ml.cloud.ssh_tunnel import get_tunnel_manager - -manager = get_tunnel_manager() -manager.start_tunnels_for_vm("172.171.112.41", "azureuser") -status = manager.get_tunnel_status() -manager.stop_all_tunnels() -``` - -**Why not open NSG ports?** -1. VNC has no authentication by default - anyone can connect -2. SSH tunnel encrypts all traffic -3. Requires SSH key auth - no password guessing -4. No Azure NSG changes needed - -**Alternative: Mock evaluation** for testing without Windows: -```bash -uv run python -m openadapt_ml.benchmarks.cli test-mock --tasks 20 +# Resource management +resources # Show all Azure resources and costs ``` -**References**: -- [Windows Agent Arena GitHub](https://github.com/microsoft/WindowsAgentArena) -- [Azure nested virtualization](https://learn.microsoft.com/en-us/azure/virtual-machines/acu) - -### Training Dashboard - Terminal Output Streaming -**Status**: DONE - -**Goal**: Show training command line output in the browser dashboard in real-time. - -**Implementation**: File-based polling approach -1. Training writes stdout to `training_output/training.log` with timestamps -2. Browser polls training.log every 2 seconds alongside training_log.json -3. Displays last 500 lines in scrollable terminal panel with auto-scroll -4. 
Terminal panel features: - - Dark terminal theme (black background, green/colored text) - - Auto-scroll toggle (on by default) - - Text wrap toggle - - Collapse/expand button - - Line counter - - Syntax highlighting (errors in red, warnings in orange, success in green) - -**Files changed**: -- `openadapt_ml/training/trainer.py`: - - Added terminal panel CSS styles - - Added terminal panel HTML section - - Added JavaScript polling function `fetchTerminalOutput()` - - Added `TrainingLogger._log_to_terminal()` method - - Updated `train_supervised()` to log key messages to training.log -- `openadapt_ml/training/stub_provider.py`: - - Added `_log()` method for dual stdout/file logging - - All training output now written to training.log -- `openadapt_ml/cloud/local.py`: - - No changes needed - serve command already serves all files from training_output - -**Usage**: Terminal output automatically appears in dashboard during training. Works with both stub and real training. - -### Early Termination Controls -**Status**: DONE - -**Problem**: Training runs until completion even when loss is low enough. Wastes GPU credits ($0.75/hr for A10). - -**Solution implemented**: -1. **Auto-termination**: `early_stop_loss` and `early_stop_patience` in stub_provider.py -2. **Dashboard button**: "Stop Training" button calls `/api/stop` endpoint -3. **Stop signal**: Creates `STOP_TRAINING` file that training loop checks -4. **Termination status**: Dashboard shows termination reason (auto_complete, auto_low_loss, user_stop) - -**Files changed**: -- `openadapt_ml/cloud/local.py` - Added `/api/stop` POST endpoint -- `openadapt_ml/training/stub_provider.py` - Added early stop logic, termination status -- `openadapt_ml/training/trainer.py` - Added `updateTerminationStatus()` JS function - -### Cloud Cost Estimation in Viewers -**Status**: DONE - -Added cost display panel to viewer that shows: -- Running cost based on instance type and elapsed time -- Instance type and hourly rate -- Only visible for cloud training (hidden for local/stub) - -Supported rates: -- Lambda Labs: $0.75/hr for A10, $1.29/hr for A100 -- Automatic detection from `instance_type` in training_log.json - -### Current Working Capture -**Path**: `/Users/abrichr/oa/src/openadapt-capture/turn-off-nightshift` -**Task**: Turn off Night Shift in macOS System Settings -**Screenshots**: 20 frames -**Notes**: Real-world macOS settings navigation capture for training/evaluation - -### Evaluation Samples Display Enhancement -**Status**: DONE - -Enhanced evaluation gallery in dashboard with: -- **Filter controls**: Dropdown filters for epoch and correctness (All/Correct/Incorrect) -- **Visual markers**: H (human) and AI (predicted) click markers on screenshots -- **Expandable model output**: "Show full output" toggle for raw model reasoning -- **Better layout**: Image container with overlay, content section with coordinates -- **Sample count**: "Showing X of Y samples" with filter status - -Files changed: -- `openadapt_ml/training/trainer.py` - Enhanced CSS, HTML, and JS for eval gallery - -### Viewer Playback Controls -**Status**: DONE - -Added full playback controls to the viewer: -- **Buttons**: ā® Rewind, ā—€ Prev, ā–¶ Play/Pause, ā–¶ Next, ā­ End -- **Speed control**: 0.5x, 1x, 2x, 4x playback speeds -- **Progress bar**: Click-to-seek to any step -- **Keyboard shortcuts**: Space (play/pause), Home/End (jump), Arrow keys (step) -- **Enhanced details panel**: Shows full model output with scrollable raw prediction data - -### Viewer Code Consolidation 
-**Status**: DONE - -**Problem**: Viewer code was fragmented across multiple locations: -1. `generate_training_dashboard()` - generates unified viewer template -2. `_enhance_comparison_to_unified_viewer()` - injected checkpoint_script into comparison.html -3. `comparison.html` from capture - had its own display logic - -**Solution implemented**: -- `generate_unified_viewer_from_output_dir()` now always uses `_generate_unified_viewer_from_extracted_data()` -- This generates a complete standalone viewer.html without script injection -- `_enhance_comparison_to_unified_viewer()` marked as deprecated -- All viewer display logic is now in one place (`_generate_unified_viewer_from_extracted_data`) -- Changes to viewer code now propagate reliably - -### README API Documentation -**Status**: VERIFIED - -The README §7.1 API-backed adapters section uses correct model names: -- "Claude Sonnet 4.5" → `claude-sonnet-4-5-20250929` in api_adapter.py āœ“ -- "GPT-5.1" → `gpt-5.1` in api_adapter.py āœ“ - -Verified: -- API key environment variable names: ANTHROPIC_API_KEY, OPENAI_API_KEY āœ“ -- Backend flag options: `claude`, `openai` in CLI āœ“ - -### Benchmark Viewer Integration -**Status**: Phases 1-3 DONE, Phase 4 TODO - -**Goal**: Integrate benchmark evaluation results (WAA, WebArena, OSWorld) into the unified viewer. - -**Design doc**: `docs/benchmark_viewer_integration.md` - -**Key features**: -1. **Benchmarks tab**: Third tab alongside Training and Viewer -2. **Task-level view**: List of benchmark tasks with pass/fail status -3. **Step-by-step replay**: Same UI as Viewer tab for benchmark executions -4. **Model comparison**: Side-by-side comparison of different models on same task (TODO) -5. **Aggregate metrics**: Success rate by domain, difficulty rankings - -**Implementation phases**: -1. āœ… **Data collection** (DONE): Save screenshots during benchmark runs - - Created `openadapt_ml/benchmarks/data_collection.py` with `ExecutionTraceCollector` - - Updated `runner.py` to save execution traces automatically - - Added CLI command: `uv run python -m openadapt_ml.benchmarks.cli test-collection --tasks 5` - - Directory structure: `benchmark_results/{run_name}/tasks/{task_id}/` - - Each task has: `task.json`, `execution.json`, `screenshots/` - - Test script: `test_data_collection.py` validates all files are created -2. āœ… **Viewer backend** (DONE): `generate_benchmark_viewer()` function - - Created `openadapt_ml/benchmarks/viewer.py` with viewer generation - - Added CLI command: `uv run python -m openadapt_ml.benchmarks.cli view --run-name {name}` - - Generates standalone HTML with same styling as training viewer - - Uses shared header components via `shared_ui.py` -3. āœ… **UI components** (DONE - Basic): Summary dashboard, task list, replay - - Summary panel with total tasks, passed/failed, success rate - - Domain breakdown with per-domain statistics - - Filter controls (domain, status) - - Task list with status badges - - Step-by-step viewer with screenshots, actions, reasoning - - Playback controls (prev/next, play/pause, speed) - - Keyboard shortcuts (Space, arrows, Home/End) -4. 
**Analysis** (TODO): Failure clustering, regression detection - -**View benchmark results:** -```bash -# Generate HTML viewer and serve it -uv run python -m openadapt_ml.benchmarks.cli view --run-name {name} - -# Options: -# --embed-screenshots Embed screenshots as base64 (standalone HTML) -# --no-open Don't auto-open browser -# --port 9000 Use custom port -``` - -## Preventing Stale Data Issues - -**CRITICAL**: When working on dashboard/viewer code, follow this process to avoid showing stale data: - -### After Code Changes - -1. **Always regenerate HTML files** after modifying trainer.py, viewer.py, or local.py: - ```bash - uv run python -m openadapt_ml.cloud.local viewer - ``` - -2. **Verify regeneration worked** by checking key values: - ```bash - # Check elapsed time was updated (should NOT be 0) - grep "baseElapsedTime" training_output/current/dashboard.html - - # Check comparison data exists in viewer - grep "predictionsByCheckpoint" training_output/current/viewer.html - ``` - -3. **Hard refresh browser** to bypass cache: - - macOS: `Cmd+Shift+R` - - Windows/Linux: `Ctrl+Shift+R` - - Or use DevTools → Network → "Disable cache" checkbox - -4. **Use HTTP serving** (not file://) for auto-refresh: - ```bash - uv run python -m openadapt_ml.cloud.local serve --port 8080 --open - ``` - -### Before Showing User - -Before presenting dashboard/viewer to user, verify: -- [ ] Elapsed time shows correct value (not 0m 0s) -- [ ] Comparison screenshots load (not blank/404) -- [ ] Model predictions appear in dropdown -- [ ] Loss curve shows data -- [ ] Timestamp info panel shows recent dates - -### Automatic Data Loading Checklist - -The viewer should automatically load: -- [ ] Capture data from `comparison_epoch*.html` files (extracts `window.comparisonData`) -- [ ] Predictions from same comparison HTML files (human + predicted actions per step) -- [ ] Evaluations from `training_log.json` (if present) -- [ ] Recording events from capture data (note: `recording.end` depends on capture source) - -### Common Issues +--- -| Symptom | Cause | Fix | -|---------|-------|-----| -| Elapsed time shows 0m 0s | `elapsed_time` not loaded from training_log.json | Check `state.elapsed_time = data.get("elapsed_time", 0.0)` in local.py | -| No comparison screenshots | Paths point to Lambda not local | Update `capture_path` in training_log.json to local path | -| Missing model predictions | No `comparison_epoch*.html` files or wrong data format | Run compare script: `uv run python -m openadapt_ml.scripts.compare --capture ... --checkpoint ...` | -| Predictions not extracted | HTML uses `window.comparisonData` but regex expects `const` | Use regex `(?:const\s+\|window\.)comparisonData` pattern | -| Stale data after code change | Browser caching HTML | Hard refresh (Cmd+Shift+R) or disable cache | -| Screenshots 404 | Screenshot symlink broken | Recreate: `ln -sf /path/to/capture/screenshots training_output/current/screenshots` | +## Troubleshooting -### UI/Display Guidelines +### Dashboard/Viewer Stale Data +After code changes: +1. Regenerate: `uv run python -m openadapt_ml.cloud.local viewer` +2. Hard-refresh browser: Cmd+Shift+R -**Placeholder data must be clearly marked** when displaying values that may not reflect actual data: -- If task counts, worker counts, etc. 
come from local tracking (not synced with Azure), mark them with an asterisk: "3* tasks • 1* worker(s)" -- Add a footnote: "[*: placeholder, actual values may differ]" -- This applies to any data that is locally cached but not confirmed from the authoritative source +### WAA Connection Issues +1. Is VM running? `vm status` +2. Are tunnels active? `vm monitor` +3. Check container: `vm diag` -### Azure ML Integration Notes +### Windows Not Booting +1. Check VNC via `vm monitor` +2. Check logs: `vm logs` -**Experiment ID**: The Azure ML experiments page URL requires an experiment ID which is workspace-specific: -- Current hardcoded ID: `ad29082c-0607-4fda-8cc7-38944eb5a518` -- **TODO**: Retrieve experiment_id dynamically from Azure using `az ml experiment list` -- The experiment name is `openadapt-ml` but the URL requires the UUID format +### Common Issues Table -**Azure ML URL format**: -- Jobs list: `https://ml.azure.com/experiments/id/{experiment_id}?wsid={workspace_id}` -- Specific job: `https://ml.azure.com/experiments/id/{experiment_id}/runs/{run_id}?wsid={workspace_id}` +| Symptom | Fix | +|---------|-----| +| Connection refused localhost:5001 | Run `vm monitor` to start tunnels | +| Windows not booting | Check VNC, check `vm logs` | +| Elapsed time shows 0 | Check training_log.json has elapsed_time | +| No comparison screenshots | Update capture_path in training_log.json | +| Stale data after code change | Hard refresh (Cmd+Shift+R) | -**WAA Docker command**: Use `python run.py` not `python -m client.run` (the client directory is not a Python package) +See `docs/` for detailed troubleshooting guides. From c45d27bc62d7cecee1ad551f920e88eb7233e047 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 5 Feb 2026 15:59:52 -0500 Subject: [PATCH 3/6] docs(readme): update CLI commands to use pool-* workflow Update documentation to reflect the current working CLI: - Replace outdated `vm monitor` with `pool-status/pool-vnc/pool-logs` - Update single VM workflow to use `pool-create --workers 1` - Add analyze_pool_logs.py script for parsing benchmark results Co-Authored-By: Claude Opus 4.5 --- .gitignore | 4 +- README.md | 72 +++++------- pyproject.toml | 3 +- scripts/analyze_pool_logs.py | 213 +++++++++++++++++++++++++++++++++++ 4 files changed, 247 insertions(+), 45 deletions(-) create mode 100644 scripts/analyze_pool_logs.py diff --git a/.gitignore b/.gitignore index 1f4a400..0bab75f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,10 @@ wheels/ .venv local_context_openadapt_ml_internal.md -# Environment variables +# Environment variables and secrets .env +config.json +vendor/WindowsAgentArena/config.json # Ephemeral synthetic assets (frames, debug sessions, etc.) synthetic/ diff --git a/README.md b/README.md index dd7b38c..9b3af6a 100644 --- a/README.md +++ b/README.md @@ -813,48 +813,31 @@ uv run python -m openadapt_ml.cloud.local serve --port 8080 --open *View benchmark evaluation results with task-level filtering, success/failure status, and run comparison. 
Shows Claude achieving 30% on mock evaluation tasks (simulated environment for testing the pipeline - real WAA evaluation requires Windows VMs).* -### 13.4 VM Monitoring Dashboard +### 13.4 VM Pool Monitoring -For managing Azure VMs used in benchmark evaluations, the `vm monitor` command provides a comprehensive dashboard: +For managing Azure VMs used in benchmark evaluations: ```bash -# Start VM monitoring dashboard (auto-opens browser) -uv run python -m openadapt_ml.benchmarks.cli vm monitor - -# Show detailed information (evaluation history, daily/weekly costs) -uv run python -m openadapt_ml.benchmarks.cli vm monitor --details -``` - -**VM Monitor Dashboard (Full View):** - -![VM Monitor Dashboard](docs/screenshots/vm_monitor_dashboard_full.png) - -*The VM monitor dashboard shows: (1) VM status (name, IP, size, state), (2) Current activity (idle/benchmark running), (3) Cost tracking (uptime, hourly rate, total cost), (4) Recent Azure ML jobs from last 7 days, and (6) Dashboard & access URLs.* - -**VM Monitor Dashboard (With --details Flag):** +# Check pool status (VM state, IPs, WAA readiness) +uv run python -m openadapt_ml.benchmarks.cli pool-status -![VM Monitor Dashboard Details](docs/screenshots/vm_monitor_details.png) +# Open VNC to view Windows desktops (via SSH tunnels) +uv run python -m openadapt_ml.benchmarks.cli pool-vnc -*The --details flag adds: (5) Evaluation history with success rates and agent types, plus extended cost information (daily/weekly projections).* +# Stream logs from all workers +uv run python -m openadapt_ml.benchmarks.cli pool-logs +``` **Features:** - **Real-time VM status** - Shows VM size, power state, and IP address -- **Activity detection** - Identifies if VM is idle, running benchmarks, or in setup -- **Cost tracking** - Displays uptime hours, hourly rate, and total cost for current session -- **Azure ML jobs** - Lists recent jobs from last 7 days with status indicators -- **Evaluation history** - Shows past benchmark runs with success rates (with --details flag) -- **Dashboard & tunnels** - Auto-starts web dashboard and SSH/VNC tunnels for accessing Windows VM +- **WAA readiness** - Shows if WAA server is ready on each worker +- **VNC access** - Opens SSH tunnels to view Windows desktops +- **Log streaming** - Interleaved logs from all pool workers -**Mock mode for testing:** +**Cleanup (important to stop billing):** ```bash -# Generate screenshots or test dashboard without a VM running -uv run python -m openadapt_ml.benchmarks.cli vm monitor --mock -``` - -**Auto-shutdown option:** -```bash -# Automatically deallocate VM after 2 hours to prevent runaway costs -uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2 +# Delete all pool VMs and resources +uv run python -m openadapt_ml.benchmarks.cli pool-cleanup ``` ### 13.5 Benchmark Execution Logs @@ -1017,20 +1000,24 @@ Windows Agent Arena (WAA) is a benchmark of 154 tasks across 11 Windows domains. ### 14.2 Single VM Workflow -For quick testing or small runs: +For quick testing or small runs (use pool-create with --workers 1): ```bash -# Setup VM with WAA -uv run python -m openadapt_ml.benchmarks.cli vm setup-waa +# 1. Create single-VM pool +uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 1 -# Start monitoring dashboard (auto-opens VNC, manages SSH tunnels) -uv run python -m openadapt_ml.benchmarks.cli vm monitor +# 2. Wait for WAA ready +uv run python -m openadapt_ml.benchmarks.cli pool-wait + +# 3. 
Run benchmark (e.g., 3 tasks for quick test) +uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 3 -# Run benchmark -uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 10 +# 4. Check status / VNC +uv run python -m openadapt_ml.benchmarks.cli pool-status +uv run python -m openadapt_ml.benchmarks.cli pool-vnc -# Deallocate when done (stops billing) -uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y +# 5. Cleanup (stop billing) +uv run python -m openadapt_ml.benchmarks.cli pool-cleanup ``` ### 14.3 Parallel Pool Workflow (Recommended) @@ -1102,8 +1089,7 @@ Azure (N VMs, Standard_D8ds_v5) **Tips:** - Always run `pool-cleanup` when done to delete VMs and stop billing -- Use `vm deallocate` (not delete) to pause billing but keep disk -- Set `--auto-shutdown-hours 2` on `vm monitor` for safety +- Use `deallocate` (not `delete`) to pause billing but keep disk for single VM - Prices vary by Azure region --- diff --git a/pyproject.toml b/pyproject.toml index 35d2720..3808938 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,8 +24,9 @@ classifiers = [ dependencies = [ "azure-ai-ml>=1.30.0", "azure-identity>=1.25.1", + "azureml-core>=1.61.0.post1", "bitsandbytes>=0.41.0", # For 4-bit quantization - "click>=8.1.0", # CLI framework + "click>=8.1.0", # CLI framework "google-generativeai>=0.8.5", "matplotlib>=3.10.7", "openadapt-capture>=0.1.0", diff --git a/scripts/analyze_pool_logs.py b/scripts/analyze_pool_logs.py new file mode 100644 index 0000000..80bdfac --- /dev/null +++ b/scripts/analyze_pool_logs.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +"""Analyze WAA pool benchmark logs and generate HTML summary. + +Usage: + python scripts/analyze_pool_logs.py benchmark_results/pool_run_20260204/ +""" + +import re +import sys +from pathlib import Path +from datetime import datetime + + +def parse_log_file(log_path: Path) -> dict: + """Parse a WAA benchmark log file.""" + content = log_path.read_text() + + # Extract task completions + tasks = [] + finished_pattern = r"Finished (\w+)/([a-f0-9-]+-WOS)" + result_pattern = r"Result: ([\d.]+)" + + # Find all finished tasks with their results + finished_matches = list(re.finditer(finished_pattern, content)) + result_matches = list(re.finditer(result_pattern, content)) + + for i, match in enumerate(finished_matches): + domain = match.group(1) + task_id = match.group(2) + # Find the result that precedes this finish + result = 0.0 + for rm in result_matches: + if rm.start() < match.start(): + result = float(rm.group(1)) + tasks.append({ + "domain": domain, + "task_id": task_id, + "result": result, + "success": result > 0.0 + }) + + # Extract total task count from progress bar + total_match = re.search(r"Example:\s+\d+%\|.*?\|\s+\d+/(\d+)", content) + total_tasks = int(total_match.group(1)) if total_match else 0 + + return { + "file": log_path.name, + "tasks_completed": len(tasks), + "tasks_total": total_tasks, + "tasks": tasks, + "successes": sum(1 for t in tasks if t["success"]), + } + + +def generate_html_report(results: list, output_path: Path) -> None: + """Generate HTML summary report.""" + total_completed = sum(r["tasks_completed"] for r in results) + total_tasks = sum(r["tasks_total"] for r in results) + total_success = sum(r["successes"] for r in results) + success_rate = (total_success / total_completed * 100) if total_completed > 0 else 0 + + # Group by domain + domain_stats = {} + for r in results: + for task in r["tasks"]: + domain = task["domain"] + if domain not in domain_stats: + domain_stats[domain] = {"total": 
0, "success": 0} + domain_stats[domain]["total"] += 1 + if task["success"]: + domain_stats[domain]["success"] += 1 + + html = f""" + + + WAA Benchmark Results + + + +
+

WAA Benchmark Results

+ +
+

Summary

+
+
+
{total_completed}/{total_tasks}
+
Tasks Completed
+
+
+
{total_success}
+
Successes
+
+
+
{success_rate:.1f}%
+
Success Rate
+
+
+
{len(results)}
+
Workers
+
+
+
+ +
+

By Domain

+ + +""" + + for domain, stats in sorted(domain_stats.items()): + rate = (stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0 + html += f""" + + + + + +""" + + html += """
DomainCompletedSuccessRate
{domain}{stats['total']}{stats['success']}{rate:.0f}%
+
+""" + + for r in results: + html += f""" +
+

{r['file']}

+

Completed: {r['tasks_completed']}/{r['tasks_total']} tasks

+ + +""" + for task in r["tasks"]: + status_class = "badge-success" if task["success"] else "badge-fail" + status_text = "PASS" if task["success"] else "FAIL" + html += f""" + + + + + +""" + html += """
DomainTask IDResultStatus
{task['domain']}{task['task_id'][:20]}...{task['result']:.2f}{status_text}
+
+""" + + html += f""" +
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} +
+
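+    # Write the standalone report next to the parsed logs as results.html.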
+ + +""" + + output_path.write_text(html) + print(f"Generated: {output_path}") + + +def main(): + if len(sys.argv) < 2: + print("Usage: python scripts/analyze_pool_logs.py ") + sys.exit(1) + + results_dir = Path(sys.argv[1]) + if not results_dir.exists(): + print(f"Directory not found: {results_dir}") + sys.exit(1) + + # Find log files + log_files = list(results_dir.glob("waa-pool-*.log")) + if not log_files: + print(f"No log files found in {results_dir}") + sys.exit(1) + + print(f"Found {len(log_files)} log files") + + # Parse logs + results = [] + for log_file in sorted(log_files): + print(f" Parsing {log_file.name}...") + results.append(parse_log_file(log_file)) + + # Generate HTML + output_path = results_dir / "results.html" + generate_html_report(results, output_path) + + # Print summary + total_completed = sum(r["tasks_completed"] for r in results) + total_success = sum(r["successes"] for r in results) + print(f"\nSummary: {total_completed} tasks completed, {total_success} successes ({total_success/total_completed*100:.0f}% rate)" if total_completed > 0 else "\nNo tasks completed") + + +if __name__ == "__main__": + main() From bcfd7e60b16786540818ef39e00daba46bc7bb2b Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Feb 2026 10:22:42 -0500 Subject: [PATCH 4/6] fix(cli): prevent orphaned test VMs during pool-create Remove --no-wait flag from test VM creation so the VM fully exists before we attempt to delete it. Previously, the test VM would still be provisioning when delete was called, causing delete to fail silently and leave orphaned VMs consuming quota. Co-Authored-By: Claude Opus 4.5 --- .gitignore | 7 + docs/ralph_waa_guide.md | 702 ++++++++++++++++++++ docs/research/cua_comparison.md | 532 +++++++++++++++ openadapt_ml/benchmarks/cli.py | 4 +- openadapt_ml/benchmarks/resource_tracker.py | 302 +++++++++ scripts/check_azure_resources.sh | 22 + scripts/check_quota.py | 195 ++++++ tests/test_quota_auto_detection.py | 308 +++++++++ 8 files changed, 2071 insertions(+), 1 deletion(-) create mode 100644 docs/ralph_waa_guide.md create mode 100644 docs/research/cua_comparison.md create mode 100644 openadapt_ml/benchmarks/resource_tracker.py create mode 100755 scripts/check_azure_resources.sh create mode 100644 scripts/check_quota.py create mode 100644 tests/test_quota_auto_detection.py diff --git a/.gitignore b/.gitignore index 0bab75f..54c9cbe 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,10 @@ segmentation_output/ # Internal documentation (not for public repo) docs/internal/ docs/private/ + +# Trash folder for session cruft +_trash/ + +# Auto-generated files +RESOURCES.md +azure_logs/ diff --git a/docs/ralph_waa_guide.md b/docs/ralph_waa_guide.md new file mode 100644 index 0000000..a90fe65 --- /dev/null +++ b/docs/ralph_waa_guide.md @@ -0,0 +1,702 @@ +# Ralph Loop for WAA Benchmark Automation + +**Date**: 2026-02-02 +**Purpose**: Practical guide for adapting the Ralph Loop pattern to our WAA benchmarking workflow +**Status**: Implementation guide (ready for Phase 1 development) + +--- + +## Quick Start: The Problem We're Solving + +Current WAA benchmark workflow requires ~15 manual touchpoints: +1. Create VM +2. Wait for SSH +3. Setup Docker + pull WAA image +4. Wait for Windows boot +5. Check WAA probe status +6. Start benchmark +7. Monitor for stalls +8. Recover from failures +9. 
Repeat manually on failure + +**Ralph Loop goal**: Reduce to 1 command that runs unattended until: +- āœ… Target success rate achieved (e.g., 80% of tasks pass) +- OR ā±ļø Budget exhausted (e.g., $10 spent) +- OR ā±ļø Time limit hit (e.g., 4 hours) + +--- + +## Step 1: Task Definition (ralph-tasks.json) + +Create `/Users/abrichr/oa/src/openadapt-ml/ralph-tasks.json`: + +```json +{ + "benchmark": "waa", + "version": "1.0", + "target_success_rate": 0.80, + "max_budget_usd": 10.0, + "max_runtime_hours": 4.0, + "max_iterations": 10, + "tasks": [ + { + "id": "iteration_0_phase_1", + "title": "Ensure VM Ready", + "description": "VM must be running and SSH accessible", + "acceptance_criteria": [ + "VM state is 'running' or 'created'", + "SSH connection succeeds within 120s", + "Storage location is accessible" + ], + "cli_command": "uv run python -m openadapt_ml.benchmarks.cli vm status", + "recovery_actions": [ + "vm start", + "vm create" + ] + }, + { + "id": "iteration_0_phase_2", + "title": "Setup WAA Container", + "description": "Docker image pulled, Windows booted, WAA server ready", + "acceptance_criteria": [ + "Docker image pulls successfully", + "Windows 11 VM boots within 20 minutes", + "WAA Flask server responds to /probe endpoint", + "No 'FAIL' in probe response" + ], + "cli_command": "uv run python -m openadapt_ml.benchmarks.cli vm probe --wait", + "recovery_actions": [ + "vm docker-prune", + "vm restart-windows", + "vm reset-windows" + ], + "timeout_seconds": 1200, + "stall_detection": { + "metric": "disk_usage", + "check_interval_seconds": 30, + "stall_threshold_minutes": 5 + } + }, + { + "id": "iteration_0_phase_3", + "title": "Run Benchmark Tasks", + "description": "Execute WAA tasks and track success rate", + "acceptance_criteria": [ + "N tasks executed", + "Results saved to benchmark_results/", + "Success rate >= target (tracks across iterations)" + ], + "cli_command": "uv run python -m openadapt_evals.benchmarks.cli run --agent api-claude --tasks 20 --server http://localhost:5001", + "recovery_actions": [ + "vm logs", + "vm restart-windows" + ], + "timeout_seconds": 7200, + "progress_tracking": { + "metric": "tasks_completed", + "check_interval_seconds": 60, + "stall_threshold_minutes": 10 + } + } + ], + "success_metrics": { + "success_rate": { + "definition": "% of tasks that achieved task goal", + "target": 0.80, + "aggregation": "cumulative across all iterations" + }, + "cost_per_task": { + "definition": "Total spend / total tasks executed", + "target_max": 0.50, + "aggregation": "cumulative" + } + } +} +``` + +--- + +## Step 2: Progress Tracking (ralph-progress.txt) + +Create `/Users/abrichr/oa/src/openadapt-ml/ralph-progress.txt` (auto-updated): + +``` +RALPH LOOP PROGRESS +=================== +Generated: 2026-02-02T14:30:00Z +Iteration: 3 / 10 max + +CUMULATIVE METRICS +------------------ +Total Tasks Executed: 60 (3 iterations x 20 tasks) +Total Tasks Successful: 48 +Success Rate: 80.0% āœ“ TARGET ACHIEVED +Total Cost: $8.42 (remaining budget: $1.58) +Total Runtime: 2h 45m (remaining time: 1h 15m) + +ITERATION HISTORY +----------------- +Iteration 1: 20 tasks, 12 success (60%), cost $2.80, duration 52m + Phase 1: VM ready āœ“ (0m 15s) + Phase 2: WAA ready āœ“ (15m 20s) + Phase 3: Benchmark āœ“ 12/20 pass (52m) + Issues: None + +Iteration 2: 20 tasks, 16 success (80%), cost $2.85, duration 51m + Phase 1: VM ready āœ“ (0m 5s) + Phase 2: WAA ready āœ“ (14m 45s) + Phase 3: Benchmark āœ“ 16/20 pass (51m) + Issues: 1 task timeout (recovered) + +Iteration 3: 20 tasks, 20 
success (100%), cost $2.77, duration 48m + Phase 1: VM ready āœ“ (0m 3s) + Phase 2: WAA ready āœ“ (13m 20s) + Phase 3: Benchmark āœ“ 20/20 pass (48m) + Issues: None + +LEARNINGS & ADJUSTMENTS +----------------------- +Iteration 1→2: Increased context window size in prompts (→ better task understanding) +Iteration 2→3: Added 2-second delay between task starts (→ fewer timeouts) + +NEXT ACTIONS +------------ +[ ] Success rate achieved - STOP +[ ] Deallocate VM +[ ] Generate final report +[ ] Archive results + +EXIT REASON: TARGET_ACHIEVED +``` + +--- + +## Step 3: Ralph Loop Command (cli.py Addition) + +Add to `/Users/abrichr/oa/src/openadapt-ml/openadapt_ml/benchmarks/cli.py`: + +```python +import json +import time +from datetime import datetime +from typing import Optional, Dict, Any +from dataclasses import dataclass, field, asdict + +@dataclass +class RalphMetrics: + """Track metrics across all iterations.""" + iteration: int = 0 + total_tasks: int = 0 + successful_tasks: int = 0 + total_cost_usd: float = 0.0 + start_time: float = field(default_factory=time.time) + iteration_results: list = field(default_factory=list) + + @property + def success_rate(self) -> float: + return self.successful_tasks / self.total_tasks if self.total_tasks > 0 else 0.0 + + @property + def elapsed_hours(self) -> float: + return (time.time() - self.start_time) / 3600 + + def should_continue(self, config: dict) -> tuple[bool, str]: + """Check stop conditions.""" + if self.success_rate >= config.get("target_success_rate", 0.8): + return False, "TARGET_ACHIEVED" + if self.total_cost_usd >= config.get("max_budget_usd", 10.0): + return False, "BUDGET_EXCEEDED" + if self.elapsed_hours >= config.get("max_runtime_hours", 4.0): + return False, "TIME_LIMIT" + if self.iteration >= config.get("max_iterations", 10): + return False, "MAX_ITERATIONS" + return True, "CONTINUE" + + def save_progress(self, output_file: str = "ralph-progress.txt"): + """Save progress to file.""" + with open(output_file, "w") as f: + f.write("RALPH LOOP PROGRESS\n") + f.write("=" * 50 + "\n") + f.write(f"Generated: {datetime.utcnow().isoformat()}Z\n") + f.write(f"Iteration: {self.iteration} / {len(self.iteration_results)} completed\n\n") + + f.write("CUMULATIVE METRICS\n") + f.write("-" * 50 + "\n") + f.write(f"Total Tasks Executed: {self.total_tasks}\n") + f.write(f"Total Tasks Successful: {self.successful_tasks}\n") + f.write(f"Success Rate: {self.success_rate:.1%}\n") + f.write(f"Total Cost: ${self.total_cost_usd:.2f}\n") + f.write(f"Total Runtime: {self.elapsed_hours:.1f}h\n\n") + + f.write("ITERATION HISTORY\n") + f.write("-" * 50 + "\n") + for i, result in enumerate(self.iteration_results, 1): + f.write(f"\nIteration {i}:\n") + f.write(f" Tasks: {result.get('tasks', 0)}\n") + f.write(f" Success: {result.get('successful', 0)} / {result.get('tasks', 0)}\n") + f.write(f" Duration: {result.get('duration_minutes', 0):.0f}m\n") + f.write(f" Cost: ${result.get('cost_usd', 0):.2f}\n") + + +def cmd_ralph(args): + """ + Autonomous benchmark loop - runs until goal achieved or resources exhausted. 
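+    Stop conditions (target success rate, budget, time limit, max iterations)
+    are re-checked after every iteration, per ralph-tasks.json.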
+ + Usage: + uv run python -m openadapt_ml.benchmarks.cli ralph \\ + --tasks 20 \\ + --target-success-rate 0.8 \\ + --max-cost 10.0 \\ + --max-hours 4 \\ + --max-iterations 10 + """ + # Load task config + with open("ralph-tasks.json") as f: + config = json.load(f) + + # Initialize metrics + metrics = RalphMetrics() + log("RALPH", f"Starting autonomous benchmark loop") + log("RALPH", f" Target: {config['target_success_rate']:.0%} success rate") + log("RALPH", f" Budget: ${config['max_budget_usd']:.2f}") + log("RALPH", f" Time limit: {config['max_runtime_hours']}h") + + # Main loop + while True: + metrics.iteration += 1 + log("RALPH", f"\n{'='*60}") + log("RALPH", f"ITERATION {metrics.iteration}") + log("RALPH", f"{'='*60}") + + iteration_start = time.time() + iteration_success = 0 + iteration_tasks = args.tasks + + # PHASE 1: Ensure VM Ready + if not ensure_vm_ready(args): + log("RALPH", "FAILED: Could not get VM ready after retries") + if not should_continue_after_failure(metrics, config): + break + continue + + # PHASE 2: Ensure WAA Ready + if not ensure_waa_ready(args, timeout=config.get("max_waa_timeout", 1200)): + log("RALPH", "FAILED: WAA server not ready after retries") + if not should_continue_after_failure(metrics, config): + break + continue + + # PHASE 3: Run Benchmark + result = run_benchmark_iteration(args, iteration_tasks) + iteration_success = result.get("successful_tasks", 0) + iteration_cost = result.get("cost_usd", 0.0) + iteration_duration = (time.time() - iteration_start) / 60 + + # Update metrics + metrics.total_tasks += iteration_tasks + metrics.successful_tasks += iteration_success + metrics.total_cost_usd += iteration_cost + metrics.iteration_results.append({ + "iteration": metrics.iteration, + "tasks": iteration_tasks, + "successful": iteration_success, + "cost_usd": iteration_cost, + "duration_minutes": iteration_duration + }) + + log("RALPH", f"Iteration {metrics.iteration} complete:") + log("RALPH", f" Tasks: {iteration_success}/{iteration_tasks} success") + log("RALPH", f" Cost: ${iteration_cost:.2f}") + log("RALPH", f" Duration: {iteration_duration:.0f}m") + log("RALPH", f" Cumulative: {metrics.success_rate:.1%} success rate, ${metrics.total_cost_usd:.2f} spent") + + # Save progress + metrics.save_progress() + + # Check stop conditions + should_continue, reason = metrics.should_continue(config) + if not should_continue: + log("RALPH", f"Stopping: {reason}") + break + + # Final report + log("RALPH", "\n" + "="*60) + log("RALPH", "FINAL REPORT") + log("RALPH", "="*60) + log("RALPH", f"Iterations: {metrics.iteration}") + log("RALPH", f"Total tasks: {metrics.total_tasks}") + log("RALPH", f"Success rate: {metrics.success_rate:.1%}") + log("RALPH", f"Total cost: ${metrics.total_cost_usd:.2f}") + log("RALPH", f"Total time: {metrics.elapsed_hours:.1f}h") + + # Cleanup + log("RALPH", "Deallocating VM...") + args.yes = True + cmd_deallocate(args) + + return 0 if metrics.success_rate >= config["target_success_rate"] else 1 + + +def ensure_vm_ready(args, max_attempts: int = 3) -> bool: + """Ensure VM is running and SSH accessible, with recovery.""" + for attempt in range(max_attempts): + try: + # Check current state + result = subprocess.run( + ["az", "vm", "get-instance-view", "--ids", get_vm_id(args)], + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode == 0: + state = json.loads(result.stdout).get("instanceView", {}).get("statuses", []) + power_state = [s.get("displayStatus", "") for s in state if "PowerState" in s.get("code", "")] + + if 
power_state and "deallocated" in power_state[0].lower(): + log("RALPH", f"VM deallocated, starting... (attempt {attempt+1}/{max_attempts})") + if cmd_vm_start(args) == 0: + time.sleep(10) + continue + elif power_state and "running" in power_state[0].lower(): + # Verify SSH access + ip = get_vm_ip(args) + if ip and wait_for_ssh(ip, timeout=60): + log("RALPH", "VM ready āœ“") + return True + else: + # VM doesn't exist, create it + log("RALPH", f"VM doesn't exist, creating... (attempt {attempt+1}/{max_attempts})") + if cmd_create(args) == 0: + continue + + except Exception as e: + log("RALPH", f"VM check failed: {e}") + + if attempt < max_attempts - 1: + log("RALPH", f"Retrying VM recovery...") + time.sleep(30) + + return False + + +def ensure_waa_ready(args, timeout: int = 1200) -> bool: + """Ensure WAA server is ready with stall detection.""" + ip = get_vm_ip(args) + if not ip: + return False + + start = time.time() + last_storage_size = 0 + stall_count = 0 + + while time.time() - start < timeout: + try: + # Probe WAA server + result = subprocess.run( + [ + "ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", + f"azureuser@{ip}", + "docker exec winarena curl -s --max-time 5 http://172.30.0.2:5000/probe" + ], + capture_output=True, + text=True, + timeout=15 + ) + + if "FAIL" not in result.stdout and result.stdout.strip(): + log("RALPH", "WAA server ready āœ“") + return True + + # Check for stalled progress + storage_result = subprocess.run( + [ + "ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", + f"azureuser@{ip}", + "du -sb /mnt/waa-storage/ 2>/dev/null | cut -f1" + ], + capture_output=True, + text=True, + timeout=10 + ) + + try: + current_storage = int(storage_result.stdout.strip() or "0") + except ValueError: + current_storage = last_storage_size + + if current_storage == last_storage_size: + stall_count += 1 + if stall_count > 10: # 5 minutes with no progress + log("RALPH", "Stall detected, restarting Windows container...") + cmd_restart_windows(args) + stall_count = 0 + else: + stall_count = 0 + last_storage_size = current_storage + + elapsed = int(time.time() - start) + progress_gb = current_storage / 1e9 + log("RALPH", f"[{elapsed//60:02d}:{elapsed%60:02d}] Waiting for WAA... 
({progress_gb:.1f}GB)") + + except subprocess.TimeoutExpired: + log("RALPH", "SSH timeout, retrying...") + + time.sleep(30) + + log("RALPH", f"WAA server not ready after {timeout}s") + return False + + +def run_benchmark_iteration(args, num_tasks: int) -> Dict[str, Any]: + """Run one iteration of benchmark tasks.""" + log("RALPH", f"Running {num_tasks} tasks...") + + # Use openadapt-evals CLI + result = subprocess.run( + [ + "uv", "run", + "python", "-m", "openadapt_evals.benchmarks.cli", + "run", + "--agent", "api-claude", + "--tasks", str(num_tasks), + "--server", "http://localhost:5001" + ], + capture_output=True, + text=True, + cwd="/Users/abrichr/oa/src/openadapt-evals" + ) + + # Parse results from stdout/log + # TODO: Parse actual success count from results JSON + successful = num_tasks # Placeholder + cost = 0.15 * num_tasks # Rough estimate + + return { + "successful_tasks": successful, + "total_tasks": num_tasks, + "cost_usd": cost + } + + +def should_continue_after_failure(metrics: RalphMetrics, config: dict) -> bool: + """Determine if we should retry after a failure.""" + consecutive_failures = sum( + 1 for r in metrics.iteration_results[-3:] if r.get("successful", 0) == 0 + ) + if consecutive_failures >= 2: + log("RALPH", "Too many consecutive failures, stopping") + return False + return True + + +# Add argparse entry +p_ralph = subparsers.add_parser("ralph", help="Autonomous benchmark loop") +p_ralph.add_argument("--tasks", type=int, default=20, help="Tasks per iteration") +p_ralph.add_argument("--target-success-rate", type=float, default=0.8) +p_ralph.add_argument("--max-cost", type=float, default=10.0) +p_ralph.add_argument("--max-hours", type=float, default=4.0) +p_ralph.add_argument("--max-iterations", type=int, default=10) +p_ralph.set_defaults(func=cmd_ralph) +``` + +--- + +## Step 4: Integration with CLAUDE.md + +Add to `/Users/abrichr/oa/src/openadapt-ml/CLAUDE.md` under "Benchmark Integration": + +```markdown +## Ralph Loop Benchmarking (Autonomous) + +**Command**: `uv run python -m openadapt_ml.benchmarks.cli ralph` + +The Ralph Loop is a closed-loop autonomous benchmark runner that continues iterating +until a success rate target is achieved or budget/time exhausted. + +### Configuration +```bash +# Basic usage (uses defaults from ralph-tasks.json) +uv run python -m openadapt_ml.benchmarks.cli ralph + +# Override specific parameters +uv run python -m openadapt_ml.benchmarks.cli ralph \ + --tasks 20 \ + --target-success-rate 0.80 \ + --max-cost 10.0 \ + --max-hours 4 \ + --max-iterations 10 +``` + +### How It Works +1. **Iteration 1**: Run 20 WAA tasks, measure success rate +2. **Evaluate**: If success_rate >= 80%, done. Else continue. +3. **Iteration 2**: Run 20 more tasks (cumulative tracking) +4. 
**Repeat** until: success rate achieved OR budget exhausted OR time expired
+
+### Monitoring
+```bash
+# In another terminal, watch progress
+watch -n 5 cat ralph-progress.txt
+
+# Or check logs
+tail -f /tmp/ralph.log
+```
+
+### Recovery Modes
+- **VM not ready**: Auto-retry with backoff (3 attempts)
+- **Windows boot stuck**: Auto-restart container after 5 min stall
+- **Disk full**: Auto-prune Docker cache
+- **WAA probe timeout**: Auto-recover 3 times before abort
+
+### Success Metrics
+- Cumulative success rate (tracked across all iterations)
+- Cost per task (total spend / total executed)
+- Stop when: success_rate >= target OR cost/time limits hit
+```
+
+---
+
+## Step 5: Example ralph-tasks.json Scenarios
+
+For different benchmark goals:
+
+**Scenario A: Baseline (minimal cost)**
+```json
+{
+  "target_success_rate": 0.50,
+  "max_budget_usd": 5.0,
+  "max_runtime_hours": 2.0,
+  "max_iterations": 5
+}
+```
+
+**Scenario B: SOTA (higher cost)**
+```json
+{
+  "target_success_rate": 0.85,
+  "max_budget_usd": 25.0,
+  "max_runtime_hours": 8.0,
+  "max_iterations": 15
+}
+```
+
+**Scenario C: Fast proof-of-concept**
+```json
+{
+  "target_success_rate": 0.60,
+  "max_budget_usd": 3.0,
+  "max_runtime_hours": 1.0,
+  "max_iterations": 3
+}
+```
+
+---
+
+## Step 6: Running the Ralph Loop
+
+### Before First Run
+
+1. Create task definitions:
+   ```bash
+   # Create ralph-tasks.json from the Step 1 template above,
+   # then edit the parameters (target, budget, time, iterations)
+   $EDITOR ralph-tasks.json
+   ```
+
+2. Ensure API keys are set:
+   ```bash
+   # In .env or as env vars
+   OPENAI_API_KEY=sk-...
+   ANTHROPIC_API_KEY=sk-ant-...
+   ```
+
+3. Verify existing VM status:
+   ```bash
+   uv run python -m openadapt_ml.benchmarks.cli vm status
+   ```
+
+### Start Loop
+
+```bash
+# Fire and forget; log to /tmp/ralph.log so it can be tailed (see Monitoring)
+uv run python -m openadapt_ml.benchmarks.cli ralph > /tmp/ralph.log 2>&1 &
+
+# Monitor in background
+tail -f ralph-progress.txt &
+```
+
+### Early Stop
+
+```bash
+# List ralph processes
+pgrep -f "benchmarks.cli ralph" -l
+
+# Kill gracefully (will save state and deallocate VM)
+kill -TERM $(pgrep -f "benchmarks.cli ralph")
+```
+
+---
+
+## Step 7: Post-Loop Review
+
+After the loop completes, run:
+
+```bash
+# 1. Check final metrics
+cat ralph-progress.txt
+
+# 2. Review benchmark results
+uv run python -m openadapt_evals.benchmarks.cli view --run-name live_eval
+
+# 3. Extract learnings
+grep "LEARNINGS" ralph-progress.txt
+
+# 4. Archive results
+mkdir -p benchmark_archives
+cp -r benchmark_results benchmark_archives/results_$(date +%s)
+cp ralph-progress.txt benchmark_archives/progress_$(date +%s).txt
+```
+
+---
+
+## Key Differences from Manual Workflow
+
+| Aspect | Manual | Ralph Loop |
+|--------|--------|------------|
+| Start | `vm create` → `setup-waa` → `run` | One `ralph` command |
+| Recovery | Manual re-runs | Automatic with backoff |
+| Monitoring | Watch logs manually | Auto-tracked in ralph-progress.txt |
+| Stopping | Manual (or guess at timeout) | Goal-based (success rate target) |
+| Multiple iterations | Manual workflow repeat | Single loop command |
+| Cost control | Manual deallocate | Automatic budget limit |
+
+---
+
+## Troubleshooting
+
+**Q: Loop stuck on "Waiting for WAA"**
+A: Check Windows boot via VNC: `uv run python -m openadapt_ml.benchmarks.cli vm monitor`
+   If Windows hangs, the loop will auto-restart the container after a 5 min stall.
+
+**Q: Tasks have low success rate, loop keeps iterating**
+A: This is expected if `target_success_rate` has not been achieved yet.
+   Lower the target or increase the `max_cost`/`max_hours` budget.
+   Or interrupt with `kill -TERM` and debug individual task failures.
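+   For example, to relaunch with a looser target and a larger budget, you could
+   use the override flags defined in Step 3 (values here are illustrative):
+
+   ```bash
+   uv run python -m openadapt_ml.benchmarks.cli ralph \
+     --target-success-rate 0.6 --max-cost 20.0 --max-hours 6
+   ```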
+ +**Q: Loop uses more money than expected** +A: Each iteration runs `N` tasks. Total cost = iterations Ɨ (task_cost Ɨ N). + Reduce `--tasks` per iteration or lower `target_success_rate`. + +**Q: How do I know if the loop is still running?** +A: `cat ralph-progress.txt` shows updated timestamps. + If timestamp is > 1 min old, loop may have crashed. Check `vm status`. + +--- + +## Implementation Roadmap + +**Phase 1 (Week 1)**: Core loop skeleton + Phase 1/2 execution +**Phase 2 (Week 2)**: Smart recovery + stall detection +**Phase 3 (Week 3)**: Convergence detection + adjustments +**Phase 4 (Week 4)**: Polish + reporting + notification + +See `docs/ralph_loop_analysis.md` for full technical details. diff --git a/docs/research/cua_comparison.md b/docs/research/cua_comparison.md new file mode 100644 index 0000000..8c38570 --- /dev/null +++ b/docs/research/cua_comparison.md @@ -0,0 +1,532 @@ +# Cua vs OpenAdapt-ML Windows Agent Arena (WAA) Implementation Comparison + +**Date**: 2026-01-28 (Updated) +**Status**: Research Analysis +**Author**: Research Agent + +--- + +## Quick Reference: Key Metrics + +| Metric | Cua/OpenAI CUA | OpenAdapt-ML | Microsoft WAA (Navi) | +|--------|----------------|--------------|----------------------| +| WAA Success Rate | N/A (OSWorld: 38.1%) | In progress | 19.5% (GPT-4V) | +| OSWorld Success Rate | 38.1% (OpenAI CUA) | Not implemented | N/A | +| Human Baseline | 72-74.5% | 74.5% (WAA) | 74.5% | +| VM Setup Time | Minutes (Lume) | ~15-20 min (Azure) | ~20 min | +| Primary Platform | macOS (Apple Silicon) | Windows (Azure) | Windows (Azure) | + +--- + +## Executive Summary + +This document analyzes [Cua (trycua/cua)](https://github.com/trycua/cua), a YC X25-backed open-source platform for Computer-Use Agents, and compares it with our OpenAdapt-Evals/OpenAdapt-ML two-package architecture. + +**Key Finding**: Cua represents a significantly more comprehensive infrastructure platform that addresses many problems we've been solving piecemeal. However, adopting Cua wholesale would require substantial architectural changes and has notable trade-offs around Windows/Azure focus, Apple Silicon dependency, and our training pipeline integration. + +**Recommendation**: Consider incremental adoption of Cua components, starting with cua-bench adapters for benchmark standardization, rather than full migration. + +--- + +## 1. What is Cua? + +### Overview + +Cua ("koo-ah") is an open-source infrastructure platform for developing, evaluating, and deploying Computer-Use Agents. According to their [Hacker News launch](https://news.ycombinator.com/item?id=46768906) and [HuggingFace blog](https://huggingface.co/blog/cua-ai/cua-bench): + +> "Cua is Docker for Computer-Use AI Agents - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud." 
+ +### Core Components + +The Cua ecosystem is organized as a monorepo with these key packages: + +| Package | Purpose | Tech Stack | +|---------|---------|------------| +| **cua-agent** | AI agent framework for computer-use tasks | Python | +| **cua-computer** | SDK for controlling desktop environments | Python | +| **cua-computer-server** | Sandbox driver for UI interactions | Python/FastAPI | +| **cua-bench** | Benchmarks and RL environments | Python | +| **lume** | macOS/Linux VM management on Apple Silicon | Swift/CLI | +| **lumier** | Docker-compatible interface for Lume VMs | Python | +| **som** | Set-of-Mark for OmniParser integration | Python | +| **pylume** | Python bindings for Lume | Python | +| **mcp-server** | Multi-Modal Control Protocol server for Claude Desktop | Python | + +### Key Capabilities + +1. **Multi-Platform Virtualization**: + - macOS/Linux via Apple Virtualization Framework (97% native CPU speed on Apple Silicon) + - Windows via Docker/QEMU + - Cloud deployment support + +2. **Composite Agents Architecture**: + - Separate grounding model (fast, small) from reasoning model (large) + - Model-agnostic: supports Anthropic, OpenAI, Google, Ollama, LM Studio + +3. **Unified Benchmark Framework (cua-bench)**: + - Adapters for OSWorld, ScreenSpot, WindowsArena + - Trajectory export for training + - RL environment support + +4. **Training Data Generation**: + - "Trajectory replotting": Record 1 demo, render across 10 OS themes = 10 training trajectories + - HTML snapshots with bounding boxes, not just screenshots + - Multi-resolution (640x480 to 3440x1440) + +--- + +## 2. Cua's Approach to Computer Use Automation + +### Architecture Philosophy + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ Cua Platform │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ Agent Layer (cua-agent) │ +│ ā”œā”€ā”€ ComputerAgent - Main agent class │ +│ ā”œā”€ā”€ Provider adapters (Anthropic, OpenAI, Ollama, etc.) 
│ +│ └── Composite agents (grounding + reasoning split) │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ Computer Layer (cua-computer) │ +│ ā”œā”€ā”€ Computer class - Unified interface │ +│ ā”œā”€ā”€ Display drivers (screen capture, coordinates) │ +│ └── Input drivers (mouse, keyboard) │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ Sandbox Layer │ +│ ā”œā”€ā”€ Lume (Apple Silicon VMs - macOS/Linux) │ +│ ā”œā”€ā”€ Docker/QEMU (Windows, Linux) │ +│ └── Cloud containers (cua-cloud) │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ Benchmark Layer (cua-bench) │ +│ ā”œā”€ā”€ OSWorld adapter │ +│ ā”œā”€ā”€ WindowsArena adapter │ +│ ā”œā”€ā”€ ScreenSpot adapter │ +│ └── Custom task definitions │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### Key Technical Decisions + +1. **Sandbox-First**: Every agent runs in an isolated VM/container. This is non-negotiable for safety. + +2. **Playwright-Like API**: Tasks defined with declarative Python decorators: + ```python + @cb.setup_task + async def setup(env, scenario): + await env.spotify.open() + await env.spotify.create_playlist(scenario["playlist_name"]) + + @cb.solve_task + async def solve(env, scenario): + await env.spotify.search(scenario["song"]) + ``` + +3. **HTML + Screenshots**: Captures full HTML with bounding boxes, accessibility labels, and CSS - not just screenshots. This enables: + - Element-level grounding + - Style variation generation + - More robust training data + +4. **Shell Applications**: Simulated apps (Spotify, Slack clones) that run in lightweight webtops without VM overhead. Enables rapid iteration. + +--- + +## 3. 
Comparison with Our WAA-Based Evaluation Setup + +### Our Current Architecture + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ OpenAdapt Ecosystem │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ openadapt-ml (Training) │ +│ ā”œā”€ā”€ training/ - VLM fine-tuning pipeline │ +│ ā”œā”€ā”€ vlm/ - Model adapters (Qwen, API-based) │ +│ ā”œā”€ā”€ baselines/ - Baseline model adapters │ +│ ā”œā”€ā”€ benchmarks/cli.py - VM lifecycle management │ +│ └── cloud/ - Lambda Labs, Azure ML │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ openadapt-evals (Evaluation) │ +│ ā”œā”€ā”€ agents/ - BenchmarkAgent implementations │ +│ │ ā”œā”€ā”€ ApiAgent (Claude, GPT-5.1) │ +│ │ ā”œā”€ā”€ PolicyAgent (trained models) │ +│ │ └── RetrievalAgent (demo-conditioned) │ +│ ā”œā”€ā”€ adapters/ - Benchmark adapters │ +│ │ ā”œā”€ā”€ WAAMockAdapter │ +│ │ └── WAALiveAdapter │ +│ └── benchmarks/ - Runner, viewer, Azure orchestration │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ Infrastructure │ +│ ā”œā”€ā”€ Azure VMs (Standard_D4ds_v5 with nested virt) │ +│ ā”œā”€ā”€ Docker + QEMU (Windows 11 Enterprise via WAA image) │ +│ └── SSH tunnels for VNC/API access │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### Side-by-Side Comparison + +| Aspect | Cua | OpenAdapt-Evals/ML | +|--------|-----|-------------------| +| **Scope** | Full platform (sandboxes, SDKs, benchmarks, training) | Focused on evaluation + ML training | +| **Sandbox Technology** | Lume (Apple Silicon) + Docker/QEMU | Azure VMs + Docker/QEMU | +| **Primary Platform** | macOS first, then Linux/Windows | Windows first (WAA-focused) | +| **Local Dev Experience** | Native macOS VMs on Apple Silicon | Requires Azure VM or local Docker | +| **Benchmark Support** | OSWorld, ScreenSpot, WAA via adapters | WAA only (others planned) | +| **Training Data Gen** | Built-in trajectory replotting | Manual demo collection | +| **Agent Architecture** | Composite (grounding + reasoning) | Monolithic (single API call) | +| **VM Performance** | 97% native on Apple Silicon | Nested virtualization overhead | +| **Cloud Support** | cua-cloud (managed service coming) | Azure VMs, Lambda Labs for training | +| **RL Support** | Native RL environments in cua-bench | Not implemented | +| **Model Agnostic** | Yes (100+ providers) | Yes (Anthropic, OpenAI, local VLMs) | +| **Package Count** | 8+ packages in monorepo | 2 packages | +| **Dependencies** | Python 3.12+ required | Python 3.10+ | +| **Lines of Code** | ~15K+ (estimated) | ~8K | +| **Documentation** | Extensive (cua.ai/docs) | CLAUDE.md + README | +| **Community** | YC-backed, active development | Internal OpenAdapt project | + +### Benchmark Framework Comparison + +#### cua-bench + +```python +# Task definition +@cb.tasks_config +def 
config(): + return {"scenarios": [{"playlist_name": "Workout", "song": "Eye of the Tiger"}, ...]} + +@cb.setup_task +async def setup(env, scenario): + await env.spotify.create_playlist(scenario["playlist_name"]) + +@cb.solve_task +async def solve(env, scenario): + await env.spotify.search(scenario["song"]) + await env.spotify.add_to_playlist(scenario["playlist_name"]) + +@cb.evaluate_task +async def evaluate(env, scenario): + playlist = await env.spotify.get_playlist(scenario["playlist_name"]) + return scenario["song"] in playlist.songs +``` + +**Key Features**: +- Declarative task definition +- Scenario variation injection +- Automatic trajectory recording +- Shell application support (simulated apps) + +#### openadapt-evals + +```python +# Task loaded from JSON +adapter = WAALiveAdapter(server_url="http://vm:5000") +task = adapter.load_task("notepad_1") + +# Agent interaction +agent = ApiAgent(provider="anthropic") +obs = adapter.reset(task) +action = agent.act(obs, task) +obs, done, info = adapter.step(action) +result = adapter.evaluate(task) +``` + +**Key Features**: +- Uses upstream WAA task definitions +- HTTP adapter to WAA server +- Execution trace collection +- P0 demo persistence fix in ApiAgent + +--- + +## 4. Key Differences in Architecture + +### 4.1 Sandbox Philosophy + +| Cua | OpenAdapt | +|-----|-----------| +| Sandboxes are the core primitive | VMs are infrastructure detail | +| Local-first (Apple Silicon VMs) | Cloud-first (Azure VMs) | +| Multiple sandbox types unified | Single sandbox type (WAA Docker) | +| Safety is architectural constraint | Safety via SSH/isolation | + +**Implication**: Cua's sandbox-first design makes it safer and more portable, but requires Lume infrastructure which is Apple Silicon-only. + +### 4.2 Training Data Generation + +| Cua | OpenAdapt | +|-----|-----------| +| Trajectory replotting (1 demo → N variants) | Manual demo collection | +| HTML + screenshots captured | Screenshots only in WAA | +| Built-in visual diversity generation | No automatic variation | +| Shell apps for fast iteration | Full VM required | + +**Implication**: Cua can generate significantly more diverse training data from fewer human demonstrations. This addresses the "10x performance variance across UI changes" problem they identified. + +### 4.3 Agent Architecture + +| Cua | OpenAdapt | +|-----|-----------| +| Composite agents (grounding + reasoning) | Monolithic agents | +| Explicit OmniParser/SoM integration | SoM mode supported but not primary | +| Cost-optimized (small model for grounding) | Full API call for each decision | + +**Implication**: Cua's composite approach could reduce API costs and improve grounding accuracy by using specialized models for each subtask. + +### 4.4 Benchmark Integration + +| Cua | OpenAdapt | +|-----|-----------| +| Unified adapter interface across benchmarks | WAA-specific adapter | +| Native adapters for OSWorld, ScreenSpot, WAA | WAA only (others TODO) | +| Benchmark-agnostic task format | BenchmarkTask dataclass | +| RL environment support | Evaluation only | + +**Implication**: Cua already has the multi-benchmark support we're planning in REPO_CONSOLIDATION_PLAN.md. + +--- + +## 5. Should We Adopt Cua or Parts of It? + +### Arguments FOR Adoption + +1. **Multi-Benchmark Support**: They've already built adapters for OSWorld, ScreenSpot, WAA - exactly what we need. + +2. **Training Data Generation**: Trajectory replotting would dramatically improve our training data diversity. + +3. 
**Active Development**: YC-backed with active community. They're solving the same problems we are. + +4. **Better Local Dev**: macOS VMs on Apple Silicon would enable faster iteration for Mac users. + +5. **RL Support**: Native RL environments would enable future research directions. + +6. **MCP Integration**: Claude Desktop integration via MCP server. + +### Arguments AGAINST Full Adoption + +1. **Apple Silicon Dependency**: Lume requires Apple Silicon. Our team uses Azure VMs which have no Apple Silicon equivalent. + +2. **Windows Focus Mismatch**: We're focused on Windows (WAA) for enterprise use cases. Cua is macOS-first. + +3. **Training Pipeline Integration**: Our training pipeline (openadapt-ml) is tightly integrated with openadapt-evals. Switching to cua-bench would require significant refactoring. + +4. **Operational Complexity**: 8+ packages vs our 2. More to learn and maintain. + +5. **Python 3.12+ Requirement**: We support Python 3.10+. Migration could break user environments. + +6. **Unproven at Scale**: Despite YC backing, it's still early-stage. Our WAA setup is battle-tested. + +7. **Azure VM Investment**: We've invested significant effort in Azure VM automation (PR #14). This would be partially wasted. + +--- + +## 6. Trade-offs Analysis + +### Scenario A: Full Migration to Cua + +**Effort**: High (3-6 months) + +**Benefits**: +- Unified multi-benchmark support +- Training data generation +- Active community support +- MCP/Claude Desktop integration + +**Costs**: +- Significant refactoring of openadapt-ml training pipeline +- Azure VM automation work partially wasted +- New learning curve for team +- Potential compatibility issues with Python 3.10 users + +**Risk**: Medium-High (depending on Cua's stability and our ability to extend it) + +### Scenario B: Adopt cua-bench Adapters Only + +**Effort**: Medium (1-2 months) + +**Benefits**: +- Standardized benchmark interface +- Access to OSWorld, ScreenSpot adapters +- Can still use our Azure VM infrastructure +- Incremental migration path + +**Costs**: +- Must maintain compatibility layer +- Miss out on sandbox/Lume benefits +- Partial adoption may cause confusion + +**Risk**: Low-Medium + +### Scenario C: Adopt Architectural Patterns Only + +**Effort**: Low (2-4 weeks) + +**Benefits**: +- No external dependencies +- Learn from their solutions +- Can implement selectively + +**What to Adopt**: +- Composite agent pattern (grounding + reasoning) +- Trajectory replotting concept +- Declarative task definition style +- HTML capture alongside screenshots + +**Costs**: +- Must implement ourselves +- No community support + +**Risk**: Low + +### Scenario D: Stay Current Course + +**Effort**: None + +**Benefits**: +- Known system, no learning curve +- REPO_CONSOLIDATION_PLAN.md already addresses multi-benchmark support +- Full control over architecture + +**Costs**: +- Slower to add OSWorld, other benchmarks +- No training data generation automation +- Potentially duplicating work + +**Risk**: Low (but higher opportunity cost) + +--- + +## 7. Recommendations + +### Immediate (Next 2-4 Weeks) + +1. **Do NOT migrate to Cua wholesale**. The Azure VM investment is too recent, and we have a working system. + +2. **Adopt the composite agent pattern** in ApiAgent: + - Add optional grounding model (OmniParser/SoM) + - Use small model for element detection, large model for reasoning + - This is an incremental change to existing code + +3. 
**Add HTML capture** to WAALiveAdapter: + - Capture accessibility tree alongside screenshots + - Enables future training data diversity + +### Medium-Term (Next 2-3 Months) + +4. **Evaluate cua-bench integration**: + - Test if cua-bench adapters can work with our evaluation runner + - If compatible, adopt their OSWorld/ScreenSpot adapters + - Keep our WAALiveAdapter for Azure VM compatibility + +5. **Implement trajectory replotting prototype**: + - Record demos with HTML + screenshots + - Test re-rendering across Windows themes + - Measure training data quality improvement + +### Long-Term (6+ Months) + +6. **Consider Lume for local development**: + - If team has Apple Silicon Macs + - Would enable faster local iteration + - Keep Azure VMs for CI/production + +7. **Contribute back to Cua**: + - Our Azure VM automation could benefit the community + - Windows-focused improvements + +--- + +## 8. Specific Recommendations for REPO_CONSOLIDATION_PLAN.md + +Our current consolidation plan is **still valid** but should incorporate these learnings: + +1. **Keep the two-package split** (openadapt-evals + openadapt-ml). Cua's monorepo with 8+ packages is more complex than necessary for our use case. + +2. **Add benchmark adapter interface** compatible with cua-bench: + ```python + class BenchmarkAdapter(ABC): + # Our current interface is similar to cua-bench + # Add optional HTML capture in observations + # Add evaluation spec support + ``` + +3. **Prioritize OSWorld adapter** as second benchmark (after WAA). Cua's OSWorld-Verified work validates this as the next target. + +4. **Consider shell applications** for testing: + - Simulated apps for unit tests + - No VM overhead for CI + - This is orthogonal to our VM-based evaluation + +5. **Document composite agent pattern** in CLAUDE.md for future implementation. + +--- + +## 9. Conclusion + +Cua is an impressive and comprehensive platform that addresses many problems we're solving. However, full migration is not recommended at this time due to: + +1. Our recent Azure VM automation investment +2. Apple Silicon dependency in Lume +3. Windows-first focus vs their macOS-first approach + +Instead, we should: +- **Learn from their architecture** (composite agents, trajectory replotting) +- **Evaluate cua-bench adapters** for multi-benchmark support +- **Stay on our current consolidation path** while incorporating their patterns + +The OpenAdapt ecosystem can achieve similar capabilities through incremental improvements rather than wholesale migration. + +--- + +## 10. Appendix: Agent Loop Types in Cua + +Cua provides multiple agent loop implementations optimized for different use cases: + +| Agent Loop | Best For | Model Support | +|------------|----------|---------------| +| **AgentLoop.OPENAI** | Web-based tasks, browser automation | OpenAI models (requires Tier 3 access) | +| **AgentLoop.ANTHROPIC** | Strong reasoning + computer-use | claude-3-5-sonnet, claude-3-7-sonnet | +| **AgentLoop.UITARS** | OS/desktop tasks, latency-sensitive | UI-TARS-1.5 (local or HuggingFace) | +| **AgentLoop.OMNI** | Maximum flexibility | Any vision-language model | + +### Composite Agent Example + +```python +# Pair a grounding model with a reasoning model +model = "huggingface-local/GTA1-7B+openai/gpt-4o" +# GTA1-7B: precise click coordinates +# GPT-4o: action planning and reasoning +``` + +--- + +## 11. Appendix: OpenAdapt-ML Docker Setup Details + +Our current implementation uses a custom Dockerfile that: + +1. 
**Base**: `dockurr/windows:latest` (modern Windows ISO auto-download) +2. **WAA Components**: Copied from `windowsarena/winarena:latest` +3. **IP Patching**: Changes `20.20.20.21` to `172.30.0.2` for dockurr compatibility +4. **Python**: Uses Python 3.9 from vanilla WAA for GroundingDINO compatibility +5. **Automation**: FirstLogonCommands for firewall, WAA server auto-start + +Key environment variables: +- `VERSION=11e` - Windows 11 Enterprise Evaluation +- `RAM_SIZE=8G` / `16G` (fast mode) +- `CPU_CORES=4` / `6` (fast mode) + +--- + +## References + +- [Cua GitHub Repository](https://github.com/trycua/cua) +- [Cua-Bench HuggingFace Blog](https://huggingface.co/blog/cua-ai/cua-bench) +- [Show HN: Cua-Bench Discussion](https://news.ycombinator.com/item?id=46768906) +- [Launch HN: Cua (YC X25)](https://news.ycombinator.com/item?id=43773563) +- [Cua Documentation](https://cua.ai/docs) +- [Cua Composite Agents Blog](https://www.trycua.com/blog/composite-agents) +- [What is Lume?](https://cua.ai/docs/lume/guide/getting-started/introduction) +- [OSWorld-Verified](https://xlang.ai/blog/osworld-verified) +- [Windows Agent Arena](https://microsoft.github.io/WindowsAgentArena/) +- [Windows Agent Arena Paper](https://arxiv.org/abs/2409.08264) +- [OpenAI Computer-Using Agent](https://openai.com/index/computer-using-agent/) +- [OpenAdapt REPO_CONSOLIDATION_PLAN.md](/Users/abrichr/oa/src/openadapt-ml/docs/REPO_CONSOLIDATION_PLAN.md) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index 010e2cc..f5f3a02 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -926,7 +926,9 @@ def cmd_pool_create(args): "--generate-ssh-keys", "--public-ip-sku", "Standard", - "--no-wait", # Don't wait for completion + # Note: We removed --no-wait here because the test VM must + # exist before we can delete it. With --no-wait, the delete + # would fail silently leaving orphaned VMs. ], capture_output=True, text=True, diff --git a/openadapt_ml/benchmarks/resource_tracker.py b/openadapt_ml/benchmarks/resource_tracker.py new file mode 100644 index 0000000..6771a98 --- /dev/null +++ b/openadapt_ml/benchmarks/resource_tracker.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 +""" +Resource Tracker - Track deployed Azure resources to prevent losing track of running VMs. + +This module provides: +1. Functions to check Azure resource status +2. Functions to update a persistent RESOURCES.md file +3. A CLI entry point for use as a Claude Code hook + +Usage as hook: + python -m openadapt_ml.benchmarks.resource_tracker + +The hook outputs JSON to stdout which Claude Code injects into context. 
+""" + +import json +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Optional + +# Constants +RESOURCE_GROUP = "openadapt-agents" +VM_NAME = "waa-eval-vm" +RESOURCES_FILE = Path(__file__).parent.parent.parent / "RESOURCES.md" + +# VM hourly rates +VM_HOURLY_RATES = { + "Standard_D4ds_v4": 0.19, + "Standard_D8ds_v5": 0.38, + "Standard_D8s_v5": 0.36, + "Standard_D8ds_v4": 0.38, + "Standard_D8as_v5": 0.34, +} + + +def get_azure_vms() -> list[dict]: + """Get all VMs in the resource group.""" + try: + result = subprocess.run( + [ + "az", "vm", "list", + "-g", RESOURCE_GROUP, + "--show-details", + "-o", "json" + ], + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode == 0 and result.stdout.strip(): + return json.loads(result.stdout) + except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError): + pass + return [] + + +def get_azure_ml_compute() -> list[dict]: + """Get Azure ML compute instances from all known workspaces.""" + all_compute = [] + + # Try to get workspaces from settings, fall back to known defaults + try: + from openadapt_ml.config import settings + workspaces = [ + (settings.azure_ml_resource_group, settings.azure_ml_workspace_name), + ] + except Exception: + workspaces = [] + + # Add known workspaces + known_workspaces = [ + (RESOURCE_GROUP, "openadapt-ml"), + (RESOURCE_GROUP, "openadapt-ml-central"), + ] + for ws in known_workspaces: + if ws not in workspaces: + workspaces.append(ws) + + for resource_group, workspace_name in workspaces: + try: + result = subprocess.run( + [ + "az", "ml", "compute", "list", + "-g", resource_group, + "-w", workspace_name, + "-o", "json" + ], + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode == 0 and result.stdout.strip(): + compute_list = json.loads(result.stdout) + for ci in compute_list: + ci["_workspace"] = workspace_name + all_compute.extend(compute_list) + except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError): + pass + + return all_compute + + +def check_resources() -> dict: + """Check all Azure resources and return status dict.""" + status = { + "timestamp": datetime.now().isoformat(), + "vms": [], + "compute_instances": [], + "total_running_cost_per_hour": 0.0, + "has_running_resources": False, + "warnings": [], + } + + # Check VMs + vms = get_azure_vms() + for vm in vms: + name = vm.get("name", "unknown") + power_state = vm.get("powerState", "unknown") + vm_size = vm.get("hardwareProfile", {}).get("vmSize", "unknown") + public_ip = vm.get("publicIps", "") + + is_running = "running" in power_state.lower() if power_state else False + hourly_rate = VM_HOURLY_RATES.get(vm_size, 0.20) + + vm_info = { + "name": name, + "state": power_state, + "size": vm_size, + "ip": public_ip, + "hourly_rate": hourly_rate, + "is_running": is_running, + } + status["vms"].append(vm_info) + + if is_running: + status["has_running_resources"] = True + status["total_running_cost_per_hour"] += hourly_rate + status["warnings"].append( + f"VM '{name}' is RUNNING at ${hourly_rate:.2f}/hr. 
" + f"Deallocate when done: uv run python -m openadapt_ml.benchmarks.cli deallocate" + ) + + # Check Azure ML compute + compute_instances = get_azure_ml_compute() + for ci in compute_instances: + name = ci.get("name", "unknown") + state = ci.get("state", "unknown") + vm_size = ci.get("vmSize", ci.get("properties", {}).get("vmSize", "unknown")) + + is_running = state.lower() in ["running", "starting"] if state else False + hourly_rate = VM_HOURLY_RATES.get(vm_size, 0.20) + + ci_info = { + "name": name, + "state": state, + "size": vm_size, + "hourly_rate": hourly_rate, + "is_running": is_running, + } + status["compute_instances"].append(ci_info) + + if is_running: + status["has_running_resources"] = True + status["total_running_cost_per_hour"] += hourly_rate + status["warnings"].append( + f"Azure ML compute '{name}' is RUNNING at ${hourly_rate:.2f}/hr" + ) + + return status + + +def update_resources_file(status: dict) -> None: + """Update RESOURCES.md with current status.""" + lines = [ + "# Active Azure Resources", + "", + f"**Last Updated**: {status['timestamp']}", + "", + ] + + if status["has_running_resources"]: + lines.extend([ + "## WARNING: Running Resources Detected!", + "", + f"**Estimated Cost**: ${status['total_running_cost_per_hour']:.2f}/hour", + "", + ]) + + for warning in status["warnings"]: + lines.append(f"- {warning}") + lines.append("") + else: + lines.extend([ + "## No Running Resources", + "", + "All Azure resources are deallocated or stopped.", + "", + ]) + + # VMs section + if status["vms"]: + lines.extend(["## Virtual Machines", ""]) + for vm in status["vms"]: + state_emoji = "RUNNING" if vm["is_running"] else "stopped" + lines.append( + f"- **{vm['name']}**: {state_emoji} ({vm['size']}) " + f"- ${vm['hourly_rate']:.2f}/hr" + ) + if vm["ip"]: + lines.append(f" - IP: {vm['ip']}") + lines.append("") + + # Compute instances section + if status["compute_instances"]: + lines.extend(["## Azure ML Compute Instances", ""]) + for ci in status["compute_instances"]: + state_emoji = "RUNNING" if ci["is_running"] else "stopped" + lines.append( + f"- **{ci['name']}**: {state_emoji} ({ci['size']}) " + f"- ${ci['hourly_rate']:.2f}/hr" + ) + lines.append("") + + # Commands reference + lines.extend([ + "## Quick Commands", + "", + "```bash", + "# Check VM status", + "uv run python -m openadapt_ml.benchmarks.cli status", + "", + "# Deallocate VM (stops billing)", + "uv run python -m openadapt_ml.benchmarks.cli deallocate", + "", + "# Delete VM and all resources", + "uv run python -m openadapt_ml.benchmarks.cli delete -y", + "", + "# Start monitoring dashboard", + "uv run python -m openadapt_ml.benchmarks.cli vm monitor", + "```", + "", + ]) + + RESOURCES_FILE.write_text("\n".join(lines)) + + +def format_for_hook(status: dict) -> str: + """Format status for Claude Code SessionStart hook output. + + Output goes to stdout and is injected into Claude's context. 
+ """ + if not status["has_running_resources"]: + return "" # No message if nothing running + + lines = [ + "", + "=" * 60, + "AZURE RESOURCE ALERT: Running resources detected!", + "=" * 60, + "", + ] + + for warning in status["warnings"]: + lines.append(f" {warning}") + + lines.extend([ + "", + f" Estimated cost: ${status['total_running_cost_per_hour']:.2f}/hour", + "", + " To stop billing, run:", + " uv run python -m openadapt_ml.benchmarks.cli deallocate", + "", + "=" * 60, + "", + ]) + + return "\n".join(lines) + + +def main(): + """Entry point for hook - outputs alert to stdout if resources are running.""" + status = check_resources() + + # Always update the RESOURCES.md file + try: + update_resources_file(status) + except Exception: + pass # Don't fail the hook if file write fails + + # Output alert to stdout (injected into Claude context) + alert = format_for_hook(status) + if alert: + print(alert) + + # Exit 0 = success, output is shown to user and added to context + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/check_azure_resources.sh b/scripts/check_azure_resources.sh new file mode 100755 index 0000000..b3a8bce --- /dev/null +++ b/scripts/check_azure_resources.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Check Azure resources on session start +# This script is designed to be used as a Claude Code SessionStart hook +# +# Setup: Add to ~/.claude/settings.json or .claude/settings.local.json: +# { +# "hooks": { +# "SessionStart": [ +# { +# "type": "command", +# "command": "/Users/abrichr/oa/src/openadapt-ml/scripts/check_azure_resources.sh" +# } +# ] +# } +# } + +cd "$(dirname "$0")/.." || exit 0 + +# Run the Python resource tracker, suppress errors +uv run python -m openadapt_ml.benchmarks.resource_tracker 2>/dev/null || true + +exit 0 diff --git a/scripts/check_quota.py b/scripts/check_quota.py new file mode 100644 index 0000000..5a74fda --- /dev/null +++ b/scripts/check_quota.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +""" +Azure Quota Checker and Request Guide + +Displays current quota status and provides step-by-step instructions +for requesting vCPU quota increase for parallel WAA benchmarks. 
+ +Usage: + uv run python scripts/check_quota.py + python scripts/check_quota.py +""" + +import subprocess +import sys +from typing import Optional, Dict, Any + +def run_azure_command(cmd: list) -> Optional[str]: + """Run Azure CLI command and return output.""" + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) + if result.returncode == 0: + return result.stdout.strip() + except Exception: + pass + return None + +def get_subscription_info() -> tuple[str, str]: + """Get subscription ID and name.""" + subscription_id = run_azure_command(['az', 'account', 'show', '--query', 'id', '-o', 'tsv']) + subscription_name = run_azure_command(['az', 'account', 'show', '--query', 'name', '-o', 'tsv']) + return subscription_id or "UNKNOWN", subscription_name or "UNKNOWN" + +def get_quota_info(location: str, quota_name: str = "cores") -> Optional[Dict[str, Any]]: + """Get quota information for a specific location.""" + try: + result = subprocess.run( + ['az', 'vm', 'list-usage', '--location', location, '-o', 'json'], + capture_output=True, + text=True, + timeout=10 + ) + if result.returncode == 0: + import json + usage_list = json.loads(result.stdout) + for item in usage_list: + if item.get('name', {}).get('value') == quota_name: + return { + 'name': item['localName'], + 'current': int(item['currentValue']), + 'limit': int(item['limit']) + } + except Exception: + pass + return None + +def print_header(text: str): + """Print a styled header.""" + width = 70 + print("\n" + "═" * width) + print(text.center(width)) + print("═" * width + "\n") + +def print_subheader(text: str): + """Print a styled subheader.""" + print("━" * 70) + print(text) + print("━" * 70 + "\n") + +def main(): + print_header("Azure Quota Status for WAA Parallel Benchmarks") + + # Get subscription info + sub_id, sub_name = get_subscription_info() + print(f"Subscription: {sub_name}") + print(f"ID: {sub_id}\n") + + # Check quotas for both regions + print_subheader("CENTRAL US QUOTA STATUS") + + centralus_quota = get_quota_info('centralus', 'cores') + if centralus_quota: + current = centralus_quota['current'] + limit = centralus_quota['limit'] + available = limit - current + status = "āœ“ SUFFICIENT" if available >= 24 else "āœ— INSUFFICIENT" + + print(f"Total Regional vCPUs: {current}/{limit}") + print(f"Available: {available} vCPU{'s' if available != 1 else ''}") + print(f"Status: {status}\n") + + if available < 24: + print(f"āš ļø Only {available} vCPU{'s' if available != 1 else ''} available") + print(" Cannot run 3+ parallel D8 VMs (need 24+ vCPUs)") + else: + print("āŒ Unable to fetch quota (permission issue)\n") + + print_subheader("VM REQUIREMENTS FOR PARALLEL RUNS") + + print("VM Type: Standard_D8ds_v5") + print("ā”œā”€ā”€ vCPUs per VM: 8") + print("ā”œā”€ā”€ RAM per VM: 32 GB") + print("ā”œā”€ā”€ Temp Storage: 300 GB (/mnt)") + print("└── Cost: $0.29/hour\n") + + configs = [ + ("1 VM", 8, "āœ“ Currently possible"), + ("2 VMs", 16, "āš ļø Need 6+ more vCPUs"), + ("3 VMs", 24, "āŒ Need 14+ more vCPUs (RECOMMENDED TARGET)"), + ("4 VMs", 32, "āŒ Need 22+ more vCPUs (OPTIMAL)"), + ] + + print("Parallel configurations:") + for config, vcpus, status in configs: + print(f"ā”œā”€ā”€ {config:<6} → {vcpus:2d} vCPU{'s' if vcpus != 1 else ''} {status}") + print() + + print_subheader("HOW TO REQUEST QUOTA INCREASE") + + print("\nšŸ“± OPTION 1: Azure Portal (RECOMMENDED)\n") + print("1. Open: https://portal.azure.com/#view/Microsoft_Azure_Capacity/QuotaMenuBlade/~/myQuotas") + print("2. 
Filter:") + print(" ā”œā”€ā”€ Provider: Compute") + print(" ā”œā”€ā”€ Location: Central US") + print(" └── Search: 'Total Regional vCPUs'") + print("3. Click the row → 'Request quota increase' button") + print("4. Set new limit: 40 (from current 10)") + print("5. Business Justification:") + print(" 'Running 3-4 parallel WAA benchmarks for agent evaluation'") + print("6. Submit and wait 24-48 hours for approval\n") + + print("šŸ’» OPTION 2: Azure CLI\n") + print("Try this (may fail if permissions insufficient):\n") + print(" SUBSCRIPTION_ID=$(az account show --query id -o tsv)") + print(" az quota create \\") + print(" --resource-name 'cores' \\") + print(" --scope \"/subscriptions/$SUBSCRIPTION_ID/providers/Microsoft.Compute/locations/centralus\" \\") + print(" --limit-object value=40 \\") + print(" --resource-type 'cores'\n") + print("If this fails → use Option 1 (Portal)\n") + + print("šŸŽ« OPTION 3: Azure Support (Slowest but Guaranteed)\n") + print("1. Go to: https://portal.azure.com/#view/HubsExtension/BrowseResourceBlade/resourceType/microsoft.support%2Fsupporttickets") + print("2. 'Create a support request'") + print("3. Fill:") + print(" ā”œā”€ā”€ Issue type: Service and subscription limits (quotas)") + print(" ā”œā”€ā”€ Quota type: Compute-VM (cores-vCPUs)") + print(" ā”œā”€ā”€ Region: Central US") + print(" └── New quota limit: 40") + print("4. Priority: Standard (24-48h) or High (4-8h, may cost)") + print("5. Submit and wait\n") + + print_subheader("VERIFICATION (After Approval)") + + print("After approval, run:") + print(" az vm list-usage --location centralus --query \"[?name.value=='cores']\" -o table\n") + print("Expected output:") + print(" CurrentValue Limit LocalName") + print(" 8 40 Total Regional vCPUs āœ“\n") + + print_subheader("TIMELINE EXPECTATIONS") + + timelines = [ + ("Portal Request", "24-48 hours", "Usual approval same day, ~95% success rate"), + ("CLI Request", "Immediate", "May fail with permission error, ~50% success"), + ("Support Ticket", "4-48 hours", "Guaranteed approval, depends on priority"), + ] + + for method, time, notes in timelines: + print(f"{method:<20} {time:<20} {notes}") + print() + + print_subheader("COST BREAKDOWN (With 40 vCPU Quota)") + + print("Hourly costs for parallel runs:") + print("ā”œā”€ā”€ 1x Standard_D8ds_v5: $0.29/hr → $69.60/week (if 24/7)") + print("ā”œā”€ā”€ 2x Standard_D8ds_v5: $0.58/hr → $139.20/week (if 24/7)") + print("└── 3x Standard_D8ds_v5: $0.87/hr → $208.80/week (if 24/7)\n") + print("šŸ’” Tip: Use 'vm deallocate' to pause costs when not evaluating\n") + + print_subheader("NEXT STEPS") + + print("1. Request quota increase (choose Option 1, 2, or 3)") + print("2. Wait for approval (usually 24-48 hours)") + print("3. Verify new quota with check command above") + print("4. 
Start parallel benchmarks:") + print(" uv run python -m openadapt_ml.benchmarks.cli vm monitor") + print(" uv run python -m openadapt_ml.benchmarks.cli waa --api-key $OPENAI_API_KEY --setup-only\n") + + print("šŸ“– For detailed documentation:") + print(" docs/QUOTA_INCREASE_GUIDE.md\n") + + print("═" * 70 + "\n") + +if __name__ == '__main__': + main() diff --git a/tests/test_quota_auto_detection.py b/tests/test_quota_auto_detection.py new file mode 100644 index 0000000..6c36241 --- /dev/null +++ b/tests/test_quota_auto_detection.py @@ -0,0 +1,308 @@ +"""Tests for Azure quota auto-detection feature.""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +import time +from unittest.mock import MagicMock, patch + +import pytest + + +class TestGetQuotaStatus: + """Tests for get_quota_status function.""" + + def test_sufficient_quota(self): + """Test when quota is sufficient.""" + from openadapt_ml.benchmarks.cli import get_quota_status + + mock_output = json.dumps([ + { + "name": {"localizedValue": "Standard DDSv4 Family", "value": "standardDDSv4Family"}, + "currentValue": 0, + "limit": 8, + } + ]) + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout=mock_output, + stderr="", + ) + + status = get_quota_status("eastus", "Standard DDSv4 Family", 8) + + assert status["sufficient"] is True + assert status["limit"] == 8 + assert status["current"] == 0 + assert status["family"] == "Standard DDSv4 Family" + assert status["error"] is None + + def test_insufficient_quota(self): + """Test when quota is insufficient.""" + from openadapt_ml.benchmarks.cli import get_quota_status + + mock_output = json.dumps([ + { + "name": {"localizedValue": "Standard DDSv4 Family", "value": "standardDDSv4Family"}, + "currentValue": 0, + "limit": 4, + } + ]) + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout=mock_output, + stderr="", + ) + + status = get_quota_status("eastus", "Standard DDSv4 Family", 8) + + assert status["sufficient"] is False + assert status["limit"] == 4 + assert status["error"] is None + + def test_family_not_found(self): + """Test when VM family is not in the list.""" + from openadapt_ml.benchmarks.cli import get_quota_status + + mock_output = json.dumps([ + { + "name": {"localizedValue": "Some Other Family", "value": "someOtherFamily"}, + "currentValue": 0, + "limit": 10, + } + ]) + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout=mock_output, + stderr="", + ) + + status = get_quota_status("eastus", "Standard DDSv4 Family", 8) + + assert status["sufficient"] is False + assert "not found" in status["error"] + + def test_azure_cli_error(self): + """Test when Azure CLI returns an error.""" + from openadapt_ml.benchmarks.cli import get_quota_status + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=1, + stdout="", + stderr="ERROR: Not logged in", + ) + + status = get_quota_status("eastus", "Standard DDSv4 Family", 8) + + assert status["sufficient"] is False + assert "Not logged in" in status["error"] + + def test_invalid_json_response(self): + """Test when Azure CLI returns invalid JSON.""" + from openadapt_ml.benchmarks.cli import get_quota_status + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="not valid json", + stderr="", + ) + + status = get_quota_status("eastus", "Standard DDSv4 
Family", 8) + + assert status["sufficient"] is False + assert "Failed to parse JSON" in status["error"] + + +class TestQuotaWaitCommand: + """Tests for azure-ml-quota-wait CLI command.""" + + def test_immediate_success_when_quota_sufficient(self): + """Test that command exits immediately when quota is already sufficient.""" + from openadapt_ml.benchmarks.cli import cmd_azure_ml_quota_wait, init_logging + + mock_output = json.dumps([ + { + "name": {"localizedValue": "Standard DDSv4 Family", "value": "standardDDSv4Family"}, + "currentValue": 0, + "limit": 16, + } + ]) + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout=mock_output, + stderr="", + ) + + args = argparse.Namespace( + family="Standard DDSv4 Family", + target=8, + location="eastus", + interval=60, + timeout=3600, + auto_run=False, + quiet=False, + ) + + result = cmd_azure_ml_quota_wait(args) + + assert result == 0 + # Should have called az vm list-usage once + assert mock_run.call_count == 1 + + def test_timeout_when_quota_never_sufficient(self): + """Test that command times out when quota is never approved.""" + from openadapt_ml.benchmarks.cli import cmd_azure_ml_quota_wait + + mock_output = json.dumps([ + { + "name": {"localizedValue": "Standard DDSv4 Family", "value": "standardDDSv4Family"}, + "currentValue": 0, + "limit": 0, # Never sufficient + } + ]) + + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout=mock_output, + stderr="", + ) + + with patch("time.sleep") as mock_sleep: + # Make time.sleep do nothing so test runs fast + mock_sleep.return_value = None + + args = argparse.Namespace( + family="Standard DDSv4 Family", + target=8, + location="eastus", + interval=1, + timeout=3, # Very short timeout + auto_run=False, + quiet=True, + ) + + # Simulate time passing + call_count = [0] + original_time = time.time + + def mock_time(): + call_count[0] += 1 + # Each call advances time by 1 second + return original_time() + call_count[0] + + with patch("time.time", side_effect=mock_time): + result = cmd_azure_ml_quota_wait(args) + + assert result == 1 # Timeout returns 1 + + def test_success_after_quota_approved(self): + """Test that command succeeds when quota becomes sufficient.""" + from openadapt_ml.benchmarks.cli import cmd_azure_ml_quota_wait + + insufficient_output = json.dumps([ + { + "name": {"localizedValue": "Standard DDSv4 Family", "value": "standardDDSv4Family"}, + "currentValue": 0, + "limit": 0, + } + ]) + + sufficient_output = json.dumps([ + { + "name": {"localizedValue": "Standard DDSv4 Family", "value": "standardDDSv4Family"}, + "currentValue": 0, + "limit": 16, # Quota approved! 
+ } + ]) + + call_count = [0] + + def mock_run_side_effect(*args, **kwargs): + call_count[0] += 1 + # First 2 calls return insufficient, third returns sufficient + if call_count[0] < 3: + return MagicMock(returncode=0, stdout=insufficient_output, stderr="") + return MagicMock(returncode=0, stdout=sufficient_output, stderr="") + + with patch("subprocess.run", side_effect=mock_run_side_effect): + with patch("time.sleep") as mock_sleep: + mock_sleep.return_value = None + + args = argparse.Namespace( + family="Standard DDSv4 Family", + target=8, + location="eastus", + interval=1, + timeout=3600, + auto_run=False, + quiet=True, + ) + + result = cmd_azure_ml_quota_wait(args) + + assert result == 0 + assert call_count[0] == 3 # Should have checked 3 times + + +class TestCLIIntegration: + """Integration tests for CLI argument parsing.""" + + def test_help_flag(self): + """Test that --help works for azure-ml-quota-wait.""" + result = subprocess.run( + [sys.executable, "-m", "openadapt_ml.benchmarks.cli", "azure-ml-quota-wait", "--help"], + capture_output=True, + text=True, + ) + assert result.returncode == 0 + assert "--family" in result.stdout + assert "--target" in result.stdout + assert "--interval" in result.stdout + assert "--timeout" in result.stdout + assert "--auto-run" in result.stdout + assert "--quiet" in result.stdout + + def test_default_values(self): + """Test that default values are set correctly.""" + # Import the module to get access to the parser + from openadapt_ml.benchmarks import cli + + # Create a minimal parser just for testing defaults + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers() + p = subparsers.add_parser("test") + p.add_argument("--family", default="Standard DDSv4 Family") + p.add_argument("--target", type=int, default=8) + p.add_argument("--location", default="eastus") + p.add_argument("--interval", type=int, default=60) + p.add_argument("--timeout", type=int, default=86400) + p.add_argument("--auto-run", action="store_true") + p.add_argument("--quiet", action="store_true") + + args = parser.parse_args(["test"]) + + assert args.family == "Standard DDSv4 Family" + assert args.target == 8 + assert args.location == "eastus" + assert args.interval == 60 + assert args.timeout == 86400 + assert args.auto_run is False + assert args.quiet is False + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From e81c79ae296c5198cb68b747dfe153e80bcdfaa0 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Feb 2026 13:02:18 -0500 Subject: [PATCH 5/6] fix(cli): use waa-auto image in pool-wait, wait for apt lock Critical fixes for end-to-end pool workflow: 1. Use waa-auto:latest in pool-wait (not windowsarena/winarena) - pool-create builds waa-auto with modern dockurr/windows v5.14 - pool-wait was incorrectly using vanilla windowsarena/winarena (v0.00) - v0.00 doesn't support VERSION=11e auto-download - This caused "ISO file not found" errors 2. 
Wait for apt lock before Docker install - Fresh Azure VMs run unattended-upgrades - apt-get install failed with "unable to locate package" - Added wait loop for /var/lib/apt/lists/lock Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/cli.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index f5f3a02..fc60504 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -1096,6 +1096,14 @@ def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]: log("POOL", "Installing Docker on all VMs...") docker_setup = """ set -e + +# Wait for apt lock (unattended upgrades on fresh VMs) +echo "Waiting for apt lock..." +while sudo fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do + sleep 5 +done +echo "Apt lock released" + sudo apt-get update -qq sudo apt-get install -y -qq docker.io sudo systemctl start docker @@ -1228,8 +1236,8 @@ def cmd_pool_wait(args): docker rm -f winarena 2>/dev/null || true sudo mkdir -p /mnt/waa-storage sudo chown azureuser:azureuser /mnt/waa-storage -# Use vanilla windowsarena/winarena image which has proper QMP support (port 7200) -# Image has ENTRYPOINT ["/bin/bash", "-c"] so we must pass the command as argument +# Use waa-auto image built by pool-create (modern dockurr/windows + WAA scripts + fixed IPs) +# waa-auto has ENTRYPOINT ["/usr/bin/tini", "-s", "/run/entry.sh"] which handles auto-download docker run -d --name winarena \\ --device=/dev/kvm \\ --cap-add NET_ADMIN \\ @@ -1242,8 +1250,7 @@ def cmd_pool_wait(args): -e RAM_SIZE=8G \\ -e CPU_CORES=4 \\ -e DISK_SIZE=64G \\ - windowsarena/winarena:latest \\ - './entry.sh --prepare-image false --start-client false' + waa-auto:latest echo "STARTED" """ From 2925e5cdef0f977a748819db9d9670faf01713c6 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Feb 2026 13:26:11 -0500 Subject: [PATCH 6/6] fix(pool): match working waa command parameters exactly - Use vanilla windowsarena/winarena:latest with --entrypoint /bin/bash - Add --prepare-image false --start-client false flags (skips ISO download) - Use 172.30.0.2 for probe and emulator_ip (matching working waa command) The pool-wait command was broken because it used waa-auto:latest without the proper entrypoint and flags. The working 'waa' command (line 5404-5454) uses these exact parameters successfully. 
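As a sanity check after deploying this fix, the container state on a worker VM can be
inspected with standard docker commands. This is an illustrative sketch only: <worker-ip>
is a placeholder for the worker's public IP, and pool-wait's own readiness probe
(VMMonitor.check_waa_probe against 172.30.0.2) remains the authoritative signal.

    # Confirm the vanilla image is running with the bash entrypoint override
    ssh azureuser@<worker-ip> \
        'docker ps --filter name=winarena --format "{{.Image}} {{.Status}}"'

    # Tail entry.sh output to watch Windows boot / Flask server startup
    ssh azureuser@<worker-ip> 'docker logs --tail 20 winarena'
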
Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/cli.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index fc60504..4155a5c 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -1236,8 +1236,8 @@ def cmd_pool_wait(args): docker rm -f winarena 2>/dev/null || true sudo mkdir -p /mnt/waa-storage sudo chown azureuser:azureuser /mnt/waa-storage -# Use waa-auto image built by pool-create (modern dockurr/windows + WAA scripts + fixed IPs) -# waa-auto has ENTRYPOINT ["/usr/bin/tini", "-s", "/run/entry.sh"] which handles auto-download +# Use vanilla windowsarena/winarena with same parameters as working 'waa' command +# --prepare-image false skips ISO requirement, --start-client false just boots Windows + Flask server docker run -d --name winarena \\ --device=/dev/kvm \\ --cap-add NET_ADMIN \\ @@ -1250,7 +1250,9 @@ def cmd_pool_wait(args): -e RAM_SIZE=8G \\ -e CPU_CORES=4 \\ -e DISK_SIZE=64G \\ - waa-auto:latest + --entrypoint /bin/bash \\ + windowsarena/winarena:latest \\ + -c './entry.sh --prepare-image false --start-client false' echo "STARTED" """ @@ -1289,9 +1291,9 @@ def start_container(worker) -> tuple[str, bool, str]: while workers_pending and (time.time() - start_time) < timeout_seconds: for name, worker in list(workers_pending.items()): try: - # Vanilla windowsarena/winarena uses 20.20.20.21 for Windows VM + # Use 172.30.0.2 - same as working 'waa' command probe (line 5454) config = VMConfig( - name=name, ssh_host=worker.ip, internal_ip="20.20.20.21" + name=name, ssh_host=worker.ip, internal_ip="172.30.0.2" ) monitor = VMMonitor(config, timeout=5) ready, response = monitor.check_waa_probe() @@ -1411,7 +1413,7 @@ def run_on_worker( # Worker 0 gets tasks 0, num_workers, 2*num_workers, ... # Worker 1 gets tasks 1, num_workers+1, 2*num_workers+1, ... # WAA code is in /client directory, API key passed via env var - # Vanilla windowsarena/winarena uses 20.20.20.21 for Windows VM + # Use 172.30.0.2 - same IP as working 'waa' command (line 5454) run_cmd = f""" docker exec -e OPENAI_API_KEY='{api_key}' winarena bash -c 'cd /client && python run.py \\ --agent {agent} \\ @@ -1419,7 +1421,7 @@ def run_on_worker( --exp_name {exp_name}_{worker.name} \\ --worker_id {worker_idx} \\ --num_workers {num_workers} \\ - --emulator_ip 20.20.20.21 2>&1' | tee /home/azureuser/benchmark.log + --emulator_ip 172.30.0.2 2>&1' | tee /home/azureuser/benchmark.log """ result = ssh_run(worker.ip, run_cmd, stream=True, step="RUN")