diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index ff3dcf44..644baa64 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -31,6 +31,7 @@
 from .utils.config import set_env_values
 from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .utils.logging import CaptureLogger
+from .validate import validate_command
 
 # Create the main Typer app
 app = typer.Typer(
@@ -978,6 +979,17 @@ def quickstart() -> None:
     clone("https://github.com/hud-evals/quickstart.git")
 
 
+@app.command()
+def validate(
+    source: str = typer.Argument(  # type: ignore[arg-type]
+        ...,
+        help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)",
+    ),
+) -> None:
+    """Validate task files or datasets without running them."""
+    validate_command(source)
+
+
 app.command(name="eval")(eval_command)
 
 
diff --git a/hud/cli/validate.py b/hud/cli/validate.py
new file mode 100644
index 00000000..1c8f1d4b
--- /dev/null
+++ b/hud/cli/validate.py
@@ -0,0 +1,135 @@
+"""Validate task files or datasets without executing them."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
+import typer
+
+from hud.datasets import load_tasks
+from hud.eval.utils import validate_v4_task
+from hud.types import Task
+from hud.utils.hud_console import hud_console
+
+
+def validate_command(source: str) -> None:
+    """Validate tasks from a file or HuggingFace dataset.
+
+    Loads tasks from *source* (a ``.json``/``.jsonl`` path or a dataset
+    slug), checks each entry against the Task schema, reports every
+    failure, and exits with status 1 if any task is invalid.
+    """
+    try:
+        raw_tasks, type_errors = _load_raw_tasks(source)
+    except Exception as e:
+        hud_console.error(f"Failed to load tasks: {e}")
+        raise typer.Exit(1) from e
+
+    # Structural problems found while loading (bad JSON lines, non-object
+    # entries) are reported alongside the per-task schema errors.
+    errors: list[str] = list(type_errors)
+    for idx, task in enumerate(raw_tasks):
+        # Dataset-loaded entries are not guaranteed to be dicts, so guard
+        # the id lookup instead of assuming `.get` exists.
+        label = (task.get("id") if isinstance(task, dict) else None) or f"index {idx}"
+        try:
+            if _looks_like_v4(task):
+                validate_v4_task(task)
+            Task(**_as_dict(task))
+        except Exception as e:  # collect every failure; report them together
+            errors.append(f"{label}: {e}")
+
+    if errors:
+        hud_console.error(f"Found {len(errors)} invalid task(s).")
+        for err in errors:
+            hud_console.error(f"- {err}")
+        raise typer.Exit(1)
+
+    hud_console.success(f"Validated {len(raw_tasks)} task(s).")
+
+
+def _as_dict(task: Any) -> dict[str, Any]:
+    """Best-effort conversion of a raw task entry to a plain dict."""
+    if isinstance(task, dict):
+        return task
+    try:
+        return dict(task)
+    except Exception:
+        # Let Task(**{}) surface the schema error instead of crashing here.
+        return {}
+
+
+def _looks_like_v4(task: dict[str, Any]) -> bool:
+    """Return True if the entry carries any v4 task-format key."""
+    return any(
+        key in task
+        for key in ("prompt", "mcp_config", "evaluate_tool", "setup_tool", "integration_test_tool")
+    )
+
+
+def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
+    """Load raw task dicts plus structural error messages from *source*.
+
+    A path ending in .json/.jsonl is read directly; anything else is
+    treated as a dataset slug and fetched via ``load_tasks``.
+    """
+    path = Path(source)
+    if path.suffix.lower() in {".json", ".jsonl"}:
+        # A task-file path that does not exist should fail clearly instead
+        # of being misinterpreted as a dataset slug.
+        if not path.exists():
+            raise FileNotFoundError(f"Task file not found: {path}")
+        return _load_raw_from_file(path)
+    return cast("list[dict[str, Any]]", load_tasks(source, raw=True)), []
+
+
+def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
+    """Parse a .json or .jsonl task file into dicts and error messages."""
+    errors: list[str] = []
+    items: list[dict[str, Any]] = []
+
+    if path.suffix.lower() == ".jsonl":
+        with open(path, encoding="utf-8") as f:
+            for line_no, line in enumerate(f, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    value = json.loads(line)
+                except json.JSONDecodeError as e:
+                    errors.append(f"line {line_no}: invalid JSON ({e.msg})")
+                    continue
+                if isinstance(value, dict):
+                    items.append(value)
+                    continue
+                if isinstance(value, list):
+                    for idx, entry in enumerate(value):
+                        if isinstance(entry, dict):
+                            items.append(entry)
+                        else:
+                            entry_type = type(entry).__name__
+                            errors.append(
+                                f"line {line_no} item {idx}: expected object, got {entry_type}"
+                            )
+                    continue
+                errors.append(
+                    f"line {line_no}: expected object or list, got {type(value).__name__}"
+                )
+        return items, errors
+
+    with open(path, encoding="utf-8") as f:
+        value = json.load(f)
+
+    if isinstance(value, dict):
+        return [value], errors
+    if isinstance(value, list):
+        for idx, entry in enumerate(value):
+            if isinstance(entry, dict):
+                items.append(entry)
+            else:
+                errors.append(f"index {idx}: expected object, got {type(entry).__name__}")
+        return items, errors
+
+    raise ValueError(f"JSON file must contain an object or array, got {type(value).__name__}")
diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py
new file mode 100644
index 00000000..0adddf77
--- /dev/null
+++ b/hud/tests/test_validate_cli.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+import typer
+
+
+def _load_validate_command():
+    module_path = Path(__file__).resolve().parents[1] / "cli" / "validate.py"
+    spec = importlib.util.spec_from_file_location("hud.cli.validate", module_path)
+    assert spec and spec.loader
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.validate_command
+
+
+def _write_tasks(path: Path, tasks: list[dict]) -> str:
+    path.write_text(json.dumps(tasks), encoding="utf-8")
+    return str(path)
+
+
+def test_validate_command_valid(tmp_path: Path) -> None:
+    validate_command = _load_validate_command()
+    tasks = [
+        {
+            "prompt": "Say hello",
+            "mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
+            "evaluate_tool": {"name": "done", "arguments": {}},
+        }
+    ]
+    path = _write_tasks(tmp_path / "tasks.json", tasks)
+    validate_command(path)
+
+
+def test_validate_command_invalid(tmp_path: Path) -> None:
+    validate_command = _load_validate_command()
+    tasks = [{"mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}]
+    path = _write_tasks(tmp_path / "tasks.json", tasks)
+    with pytest.raises(typer.Exit):
+        validate_command(path)
+
+
+def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None:
+    validate_command = _load_validate_command()
+    tasks = [
+        {
+            "prompt": "ok",
+            "mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
+            "evaluate_tool": {"name": "done", "arguments": {}},
+        },
+        "not a task",
+    ]
+    path = _write_tasks(tmp_path / "tasks.json", tasks)
+    with pytest.raises(typer.Exit):
+        validate_command(path)