From 9b4f45c79d87fb72266371f4f7917c2bc002e83f Mon Sep 17 00:00:00 2001 From: MagellaX Date: Mon, 26 Jan 2026 14:06:45 +0530 Subject: [PATCH 1/5] Add task validation command --- hud/cli/__init__.py | 12 +++++++++ hud/cli/validate.py | 48 ++++++++++++++++++++++++++++++++++ hud/tests/test_validate_cli.py | 43 ++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 hud/cli/validate.py create mode 100644 hud/tests/test_validate_cli.py diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 02b788d78..6067cff64 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -30,6 +30,7 @@ from .pull import pull_command from .push import push_command from .remove import remove_command +from .validate import validate_command from .utils.config import set_env_values from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config from .utils.logging import CaptureLogger @@ -790,6 +791,17 @@ def quickstart() -> None: clone("https://github.com/hud-evals/quickstart.git") +@app.command() +def validate( + source: str = typer.Argument( # type: ignore[arg-type] # noqa: B008 + ..., + help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)", + ), +) -> None: + """Validate task files or datasets without running them.""" + validate_command(source) + + @app.command() def eval( source: str | None = typer.Argument( diff --git a/hud/cli/validate.py b/hud/cli/validate.py new file mode 100644 index 000000000..19fbe2a9e --- /dev/null +++ b/hud/cli/validate.py @@ -0,0 +1,48 @@ +"""Validate task files or datasets.""" + +from __future__ import annotations + +from typing import Any + +import typer +from pydantic import ValidationError + +from hud.types import Task +from hud.utils.tasks import load_tasks +from hud.utils.hud_console import hud_console + + +def validate_command(source: str) -> None: + """Validate tasks from a file or HuggingFace dataset.""" + try: + raw_tasks = load_tasks(source, raw=True) + except Exception as e: + hud_console.error(f"Failed to load tasks: {e}") + raise typer.Exit(1) from e + + errors: list[str] = [] + for idx, task in enumerate(raw_tasks): + label = task.get("id") or f"index {idx}" + try: + Task(**_as_dict(task)) + except ValidationError as e: + errors.append(f"{label}: {e}") + except Exception as e: + errors.append(f"{label}: {e}") + + if errors: + hud_console.error(f"Found {len(errors)} invalid task(s).") + for err in errors: + hud_console.error(f"- {err}") + raise typer.Exit(1) + + hud_console.success(f"Validated {len(raw_tasks)} task(s).") + + +def _as_dict(task: Any) -> dict[str, Any]: + if isinstance(task, dict): + return task + try: + return dict(task) + except Exception: + return {} diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py new file mode 100644 index 000000000..cf1f65f2f --- /dev/null +++ b/hud/tests/test_validate_cli.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import importlib.util +import json +from pathlib import Path + +import pytest +import typer + + +def _load_validate_command(): + module_path = Path(__file__).resolve().parents[1] / "cli" / "validate.py" + spec = importlib.util.spec_from_file_location("hud.cli.validate", module_path) + module = importlib.util.module_from_spec(spec) # type: ignore[arg-type] + assert spec and spec.loader + spec.loader.exec_module(module) + return module.validate_command + + +def _write_tasks(path: Path, tasks: list[dict]) -> str: + path.write_text(json.dumps(tasks), encoding="utf-8") + return str(path) + + +def test_validate_command_valid(tmp_path: Path) -> None: + validate_command = _load_validate_command() + tasks = [ + { + "prompt": "Say hello", + "mcp_config": {"local": {"command": "echo", "args": ["hi"]}}, + "evaluate_tool": {"name": "done", "arguments": {}}, + } + ] + path = _write_tasks(tmp_path / "tasks.json", tasks) + validate_command(path) + + +def test_validate_command_invalid(tmp_path: Path) -> None: + validate_command = _load_validate_command() + tasks = [{"mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}] + path = _write_tasks(tmp_path / "tasks.json", tasks) + with pytest.raises(typer.Exit): + validate_command(path) From edcd13df9d58fa11aac981514da53cb67c2a6dbf Mon Sep 17 00:00:00 2001 From: MagellaX Date: Mon, 26 Jan 2026 14:26:29 +0530 Subject: [PATCH 2/5] Validate: flag non-object task entries --- hud/cli/validate.py | 56 +++++++++++++++++++++++++++++++++- hud/tests/test_validate_cli.py | 11 +++++++ 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/hud/cli/validate.py b/hud/cli/validate.py index 19fbe2a9e..8b50005c7 100644 --- a/hud/cli/validate.py +++ b/hud/cli/validate.py @@ -2,6 +2,8 @@ from __future__ import annotations +import json +from pathlib import Path from typing import Any import typer @@ -15,12 +17,13 @@ def validate_command(source: str) -> None: """Validate tasks from a file or HuggingFace dataset.""" try: - raw_tasks = load_tasks(source, raw=True) + raw_tasks, type_errors = _load_raw_tasks(source) except Exception as e: hud_console.error(f"Failed to load tasks: {e}") raise typer.Exit(1) from e errors: list[str] = [] + errors.extend(type_errors) for idx, task in enumerate(raw_tasks): label = task.get("id") or f"index {idx}" try: @@ -46,3 +49,54 @@ def _as_dict(task: Any) -> dict[str, Any]: return dict(task) except Exception: return {} + + +def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]: + path = Path(source) + if path.exists() and path.suffix.lower() in {".json", ".jsonl"}: + return _load_raw_from_file(path) + return load_tasks(source, raw=True), [] + + +def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]: + errors: list[str] = [] + items: list[dict[str, Any]] = [] + + if path.suffix.lower() == ".jsonl": + with open(path, encoding="utf-8") as f: + for line_no, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + value = json.loads(line) + if isinstance(value, dict): + items.append(value) + continue + if isinstance(value, list): + for idx, entry in enumerate(value): + if isinstance(entry, dict): + items.append(entry) + else: + errors.append( + f"line {line_no} item {idx}: expected object, got {type(entry).__name__}" + ) + continue + errors.append( + f"line {line_no}: expected object or list, got {type(value).__name__}" + ) + return items, errors + + with open(path, encoding="utf-8") as f: + value = json.load(f) + + if isinstance(value, dict): + return [value], errors + if isinstance(value, list): + for idx, entry in enumerate(value): + if isinstance(entry, dict): + items.append(entry) + else: + errors.append(f"index {idx}: expected object, got {type(entry).__name__}") + return items, errors + + raise ValueError(f"JSON file must contain an object or array, got {type(value).__name__}") diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py index cf1f65f2f..52d10d335 100644 --- a/hud/tests/test_validate_cli.py +++ b/hud/tests/test_validate_cli.py @@ -41,3 +41,14 @@ def test_validate_command_invalid(tmp_path: Path) -> None: path = _write_tasks(tmp_path / "tasks.json", tasks) with pytest.raises(typer.Exit): validate_command(path) + + +def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None: + validate_command = _load_validate_command() + tasks = [ + {"prompt": "ok", "mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}, + "not a task", + ] + path = _write_tasks(tmp_path / "tasks.json", tasks) + with pytest.raises(typer.Exit): + validate_command(path) From fa8aade041ffa1edda4831ad709c14a5628b8d24 Mon Sep 17 00:00:00 2001 From: MagellaX Date: Mon, 26 Jan 2026 15:40:26 +0530 Subject: [PATCH 3/5] Fix eval CLI selection and strict v4 validation --- hud/cli/__init__.py | 16 +++++++--------- hud/cli/validate.py | 12 +++++++++++- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 5096c068a..0ba34b21e 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -21,7 +21,6 @@ from .debug import debug_mcp_stdio from .deploy import deploy_command from .dev import run_mcp_dev_server -from .eval import eval_command from .link import link_command from .pull import pull_command from .push import push_command @@ -1103,6 +1102,7 @@ def eval( # If no agent specified, fetch available models and prompt for selection base_model = None + hud_model_base_map: dict[str, str] = {} if agent is None: # Get available HUD models first hud_models = get_available_models() @@ -1113,7 +1113,7 @@ def eval( # Add HUD models as agent choices for hud_model in hud_models: model_name = hud_model["name"] - base_model = hud_model["base_model"] + hud_model_base_map[model_name] = hud_model["base_model"] vllm_status = " ⚡" if hud_model.get("vllm_url") else "" choices.append({"name": f"{model_name}{vllm_status}", "value": f"{model_name}"}) @@ -1139,11 +1139,11 @@ def eval( # Set model to base model for the vllm endpoint if not base_model: - hud_models = get_available_models() - for hud_model in hud_models: - if hud_model["name"] == model: - base_model = hud_model["base_model"] - break + if not hud_model_base_map: + hud_models = get_available_models() + for hud_model in hud_models: + hud_model_base_map[hud_model["name"]] = hud_model["base_model"] + base_model = hud_model_base_map.get(model) if not base_model: hud_console.error(f"Model {model} not found") raise typer.Exit(1) @@ -1176,8 +1176,6 @@ def eval( integration_test=integration_test, ) -app.command(name="eval")(eval_command) - @app.command() diff --git a/hud/cli/validate.py b/hud/cli/validate.py index 8b50005c7..baad9dad5 100644 --- a/hud/cli/validate.py +++ b/hud/cli/validate.py @@ -9,8 +9,9 @@ import typer from pydantic import ValidationError +from hud.datasets import load_tasks +from hud.eval.utils import validate_v4_task from hud.types import Task -from hud.utils.tasks import load_tasks from hud.utils.hud_console import hud_console @@ -27,6 +28,8 @@ def validate_command(source: str) -> None: for idx, task in enumerate(raw_tasks): label = task.get("id") or f"index {idx}" try: + if _looks_like_v4(task): + validate_v4_task(task) Task(**_as_dict(task)) except ValidationError as e: errors.append(f"{label}: {e}") @@ -51,6 +54,13 @@ def _as_dict(task: Any) -> dict[str, Any]: return {} +def _looks_like_v4(task: dict[str, Any]) -> bool: + return any( + key in task + for key in ("prompt", "mcp_config", "evaluate_tool", "setup_tool", "integration_test_tool") + ) + + def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]: path = Path(source) if path.exists() and path.suffix.lower() in {".json", ".jsonl"}: From 7e5773888630bc5769bf77d41809f73c8ae3218d Mon Sep 17 00:00:00 2001 From: MagellaX Date: Mon, 26 Jan 2026 15:57:51 +0530 Subject: [PATCH 4/5] Restore eval command registration and tighten types --- hud/cli/__init__.py | 186 +------------------------------------------- hud/cli/validate.py | 4 +- 2 files changed, 4 insertions(+), 186 deletions(-) diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 0ba34b21e..f32cedc3e 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -21,6 +21,7 @@ from .debug import debug_mcp_stdio from .deploy import deploy_command from .dev import run_mcp_dev_server +from .eval import eval_command from .link import link_command from .pull import pull_command from .push import push_command @@ -981,7 +982,6 @@ def quickstart() -> None: clone("https://github.com/hud-evals/quickstart.git") - @app.command() def validate( source: str = typer.Argument( # type: ignore[arg-type] # noqa: B008 @@ -993,189 +993,7 @@ def validate( validate_command(source) -@app.command() -def eval( - source: str | None = typer.Argument( - None, - help=( - "HuggingFace dataset (e.g. 'hud-evals/SheetBench-50') or task JSON file. " - "If not provided, looks for task.json in current directory." - ), - ), - agent: str | None = typer.Argument( - None, - help=( - "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively." # noqa: E501 - ), - ), - full: bool = typer.Option( - False, - "--full", - help="Run the entire dataset (omit for single-task debug mode)", - ), - model: str | None = typer.Option( - None, - "--model", - help="Model name for the chosen agent", - ), - allowed_tools: str | None = typer.Option( - None, - "--allowed-tools", - help="Comma-separated list of allowed tools", - ), - max_concurrent: int = typer.Option( - 30, - "--max-concurrent", - help="Maximum concurrent tasks (1-200 recommended, prevents rate limits)", - ), - max_steps: int | None = typer.Option( - None, - "--max-steps", - help="Maximum steps per task (default: 10 for single, 50 for full)", - ), - verbose: bool = typer.Option( - False, - "--verbose", - "-v", - help="Enable verbose output from the agent", - ), - very_verbose: bool = typer.Option( - False, - "--very-verbose", - "-vv", - help="Enable debug-level logs for maximum visibility", - ), - vllm_base_url: str | None = typer.Option( - None, - "--vllm-base-url", - help="Base URL for vLLM server (when using --agent vllm)", - ), - group_size: int = typer.Option( - 1, - "--group-size", - help="Number of times to run each task (similar to RL training)", - ), - integration_test: bool = typer.Option( - False, - "--integration-test", - help=( - "Run integration_test_tool, where problem is setup, " - "actions are applied, and evaluation is performed, without " - "spinning up an agent" - ), - ), -) -> None: - """🚀 Run evaluation on datasets or individual tasks with agents.""" - from hud.settings import settings - from hud.utils.hud_console import HUDConsole - - hud_console = HUDConsole() - - if integration_test: - agent = AgentType.INTEGRATION_TEST - - # If no source provided, reuse RL helper to find a tasks file interactively - if source is None: - try: - from hud.cli.utils.tasks import find_tasks_file - - source = find_tasks_file(None, msg="Select a tasks file to run") - hud_console.success(f"Selected: {source}") - except (FileNotFoundError, Exception): - hud_console.error( - "No source provided and no task/eval JSON files found in current directory" - ) - hud_console.info( - "Usage: hud eval or create a task JSON file (e.g., task.json, tasks.jsonl)" - ) - raise typer.Exit(1) from None - - # Import eval_command lazily to avoid importing agent dependencies - try: - from .eval import eval_command, get_available_models - except ImportError as e: - hud_console.error( - "Evaluation dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # If no agent specified, fetch available models and prompt for selection - base_model = None - hud_model_base_map: dict[str, str] = {} - if agent is None: - # Get available HUD models first - hud_models = get_available_models() - - # Build choices starting with HUD models - choices = [] - - # Add HUD models as agent choices - for hud_model in hud_models: - model_name = hud_model["name"] - hud_model_base_map[model_name] = hud_model["base_model"] - vllm_status = " ⚡" if hud_model.get("vllm_url") else "" - choices.append({"name": f"{model_name}{vllm_status}", "value": f"{model_name}"}) - - # Add standard agent choices - choices.extend( - [ - {"name": "Claude 4 Sonnet", "value": AgentType.CLAUDE}, - {"name": "OpenAI Computer Use", "value": AgentType.OPENAI}, - {"name": "Gemini Computer Use", "value": AgentType.GEMINI}, - {"name": "vLLM (Local Server)", "value": AgentType.VLLM}, - {"name": "LiteLLM (Multi-provider)", "value": AgentType.LITELLM}, - ] - ) - - agent = hud_console.select("Select an agent to use:", choices=choices, default=0) - - # Handle HUD model selection - if agent and agent not in [e.value for e in AgentType]: - # Find remote model name - model = agent - if not vllm_base_url: - vllm_base_url = f"{settings.hud_rl_url}/models/{model}/vllm" - - # Set model to base model for the vllm endpoint - if not base_model: - if not hud_model_base_map: - hud_models = get_available_models() - for hud_model in hud_models: - hud_model_base_map[hud_model["name"]] = hud_model["base_model"] - base_model = hud_model_base_map.get(model) - if not base_model: - hud_console.error(f"Model {model} not found") - raise typer.Exit(1) - model = base_model - agent = AgentType.VLLM # Use vLLM backend for HUD models - hud_console.info(f"Using HUD model: {model} (trained on {base_model})") - - # Validate agent choice - valid_agents = [e.value for e in AgentType] - if agent not in valid_agents: - hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}") - raise typer.Exit(1) - - # Type narrowing: agent is now guaranteed to be an AgentType value after validation - agent = AgentType(agent) - - # Run the command - eval_command( - source=source, - full=full, - agent=agent, - model=model, - allowed_tools=allowed_tools, - max_concurrent=max_concurrent, - max_steps=max_steps, - verbose=verbose, - very_verbose=very_verbose, - vllm_base_url=vllm_base_url, - group_size=group_size, - integration_test=integration_test, - ) - +app.command(name="eval")(eval_command) @app.command() diff --git a/hud/cli/validate.py b/hud/cli/validate.py index baad9dad5..9ceeaa345 100644 --- a/hud/cli/validate.py +++ b/hud/cli/validate.py @@ -4,7 +4,7 @@ import json from pathlib import Path -from typing import Any +from typing import Any, cast import typer from pydantic import ValidationError @@ -65,7 +65,7 @@ def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]: path = Path(source) if path.exists() and path.suffix.lower() in {".json", ".jsonl"}: return _load_raw_from_file(path) - return load_tasks(source, raw=True), [] + return cast(list[dict[str, Any]], load_tasks(source, raw=True)), [] def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]: From 0191b4d1a8246f38c8b7b74ac05cc765a2c22f1d Mon Sep 17 00:00:00 2001 From: MagellaX Date: Mon, 26 Jan 2026 18:24:22 +0530 Subject: [PATCH 5/5] Fix ruff formatting and JSONL validation --- hud/cli/__init__.py | 7 ++----- hud/cli/validate.py | 11 ++++++++--- hud/tests/test_validate_cli.py | 6 +++++- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index f32cedc3e..644baa647 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -26,15 +26,12 @@ from .pull import pull_command from .push import push_command from .remove import remove_command - -from .validate import validate_command - from .rft import rft_command from .rft_status import rft_status_command - from .utils.config import set_env_values from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config from .utils.logging import CaptureLogger +from .validate import validate_command # Create the main Typer app app = typer.Typer( @@ -984,7 +981,7 @@ def quickstart() -> None: @app.command() def validate( - source: str = typer.Argument( # type: ignore[arg-type] # noqa: B008 + source: str = typer.Argument( # type: ignore[arg-type] ..., help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)", ), diff --git a/hud/cli/validate.py b/hud/cli/validate.py index 9ceeaa345..1c8f1d4b3 100644 --- a/hud/cli/validate.py +++ b/hud/cli/validate.py @@ -65,7 +65,7 @@ def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]: path = Path(source) if path.exists() and path.suffix.lower() in {".json", ".jsonl"}: return _load_raw_from_file(path) - return cast(list[dict[str, Any]], load_tasks(source, raw=True)), [] + return cast("list[dict[str, Any]]", load_tasks(source, raw=True)), [] def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]: @@ -78,7 +78,11 @@ def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]: line = line.strip() if not line: continue - value = json.loads(line) + try: + value = json.loads(line) + except json.JSONDecodeError as e: + errors.append(f"line {line_no}: invalid JSON ({e.msg})") + continue if isinstance(value, dict): items.append(value) continue @@ -87,8 +91,9 @@ def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]: if isinstance(entry, dict): items.append(entry) else: + entry_type = type(entry).__name__ errors.append( - f"line {line_no} item {idx}: expected object, got {type(entry).__name__}" + f"line {line_no} item {idx}: expected object, got {entry_type}" ) continue errors.append( diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py index 52d10d335..0adddf774 100644 --- a/hud/tests/test_validate_cli.py +++ b/hud/tests/test_validate_cli.py @@ -46,7 +46,11 @@ def test_validate_command_invalid(tmp_path: Path) -> None: def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None: validate_command = _load_validate_command() tasks = [ - {"prompt": "ok", "mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}, + { + "prompt": "ok", + "mcp_config": {"local": {"command": "echo", "args": ["hi"]}}, + "evaluate_tool": {"name": "done", "arguments": {}}, + }, "not a task", ] path = _write_tasks(tmp_path / "tasks.json", tasks)