From e7d3d517578df09cd972ba147e2d7d04709bef1e Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 11:15:24 -0800 Subject: [PATCH 01/14] j/fix-job-linking --- hud/cli/eval.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index d705f01d..1c251858 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -622,6 +622,10 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: hud_console.error(f"No tasks found in: {cfg.source}") raise typer.Exit(1) + # Default taskset to source when loading from API (not a local file) + if cfg.taskset is None and not Path(cfg.source).exists(): + cfg.taskset = cfg.source + # Filter by task slugs (or positional indices) if provided if cfg.task_ids: selector_set = set(cfg.task_ids) From c596dc135e60be6dc74bb4b4122de2acc9383445 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 11:36:07 -0800 Subject: [PATCH 02/14] use taskset_id for API-loaded tasks --- hud/cli/eval.py | 9 ++++----- hud/datasets/loader.py | 26 ++++++++++++++++++-------- hud/datasets/runner.py | 2 ++ hud/eval/manager.py | 14 +++++++------- hud/eval/types.py | 2 +- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 1c251858..5ae8805a 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -622,9 +622,8 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: hud_console.error(f"No tasks found in: {cfg.source}") raise typer.Exit(1) - # Default taskset to source when loading from API (not a local file) - if cfg.taskset is None and not Path(cfg.source).exists(): - cfg.taskset = cfg.source + # Extract taskset_id from API-loaded tasks (set by loader in metadata) + taskset_id: str | None = tasks[0].metadata.get("taskset_id") if tasks else None # Filter by task slugs (or positional indices) if provided if cfg.task_ids: @@ -695,7 +694,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: variants=None, group=cfg.group_size, api_key=None, - taskset=cfg.taskset, + taskset_id=taskset_id, hud_eval_config=eval_cfg_dict, ) @@ -738,7 +737,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: max_concurrent=cfg.max_concurrent, group_size=cfg.group_size, quiet=cfg.quiet, - taskset=cfg.taskset, + taskset_id=taskset_id, ) # Show reward for single task diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index 848e868c..48b6d8b0 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -108,8 +108,11 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]: return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] -def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]: - """Load raw task dicts from HUD API.""" +def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | None]: + """Load raw task dicts from HUD API. + + Returns (tasks, taskset_id) tuple. + """ from hud.datasets.utils import _normalize_task_dict headers = {} @@ -125,23 +128,27 @@ def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]: response.raise_for_status() data = response.json() - # Extract tasks dict from response + taskset_id = data.get("evalset_id") tasks_dict = data.get("tasks", {}) - return [ + tasks = [ _normalize_task_dict(task_data) for task_data in tasks_dict.values() if isinstance(task_data, dict) ] + return tasks, taskset_id def _load_from_api(dataset_name: str) -> list[Task]: """Load tasks from HUD API.""" from hud.eval.task import Task - raw_items = _load_raw_from_api(dataset_name) - # Default args to {} for runnable tasks (None = template) - return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] + raw_items, taskset_id = _load_raw_from_api(dataset_name) + tasks = [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] + if taskset_id: + for task in tasks: + task.metadata["taskset_id"] = taskset_id + return tasks @overload @@ -210,7 +217,10 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, # Try HUD API first try: logger.info("Trying HUD API: %s", source) - items = _load_raw_from_api(source) if raw else _load_from_api(source) + if raw: + items, _ = _load_raw_from_api(source) + else: + items = _load_from_api(source) logger.info("Loaded %d tasks from HUD API: %s", len(items), source) return items except Exception as hud_error: diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py index d4d9a586..a34c0a76 100644 --- a/hud/datasets/runner.py +++ b/hud/datasets/runner.py @@ -30,6 +30,7 @@ async def run_dataset( group_size: int = 1, quiet: bool = True, taskset: str | None = None, + taskset_id: str | None = None, ) -> list[EvalContext]: """Run an agent on a dataset of tasks. @@ -98,6 +99,7 @@ async def run_dataset( max_concurrent=max_concurrent, quiet=quiet, taskset=taskset, + taskset_id=taskset_id, ) as ctx: # Build agent params - use system_prompt from ctx (set from task.agent_config) final_agent_params = dict(agent_params or {}) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 19481232..4b3ffcd9 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -71,13 +71,12 @@ async def _send_job_enter( variants: dict[str, Any] | None, group: int, api_key: str | None, - taskset: str | None = None, + taskset_id: str | None = None, hud_eval_config: dict[str, Any] | None = None, ) -> None: """Send job enter payload (async request before traces start). - Registers the job with the platform. Tasks must already exist in the - taskset. + Registers the job with the platform. """ import httpx @@ -92,7 +91,7 @@ async def _send_job_enter( name=name, variants=variants, group=group, - taskset=taskset, + taskset_id=taskset_id, hud_eval_config=hud_eval_config, ) @@ -122,6 +121,7 @@ async def run_eval( trace: bool = True, quiet: bool = False, taskset: str | None = None, + taskset_id: str | None = None, ) -> AsyncGenerator[EvalContext, None]: """Standalone eval context manager. @@ -253,7 +253,7 @@ async def run_eval( if total_evals == 1: if tasks: job_id_for_run = job_id - if taskset: + if taskset or taskset_id: eval_name = _get_eval_name(tasks=tasks, group=group) if job_id_for_run is None: job_id_for_run = str(uuid.uuid4()) @@ -264,7 +264,7 @@ async def run_eval( variants=variants, group=group, api_key=api_key, - taskset=taskset, + taskset_id=taskset_id, ) # Single task - use EvalContext.from_task() @@ -311,7 +311,7 @@ async def run_eval( variants=variants, group=group, api_key=api_key, - taskset=taskset, + taskset_id=taskset_id, ) # Print job URL (not individual trace URLs) diff --git a/hud/eval/types.py b/hud/eval/types.py index ecc307ec..1d43926e 100644 --- a/hud/eval/types.py +++ b/hud/eval/types.py @@ -54,7 +54,7 @@ class JobEnterPayload(BaseModel): name: str | None = None variants: dict[str, Any] | None = None # Full variant config group: int | None = None - taskset: str | None = None # taskset slug to associate job with + taskset_id: str | None = None # evalset UUID to associate job with hud_eval_config: dict[str, Any] | None = None # replayable hud eval config (no secrets) From 14855219cfeba3c0a4b3854a91b9eb5fb197ba93 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 11:36:12 -0800 Subject: [PATCH 03/14] remove hf --- hud/cli/eval.py | 2 +- hud/datasets/__init__.py | 2 +- hud/datasets/loader.py | 95 ++++------------------------------------ pyproject.toml | 2 - 4 files changed, 10 insertions(+), 91 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 5ae8805a..459eeabc 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -753,7 +753,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: def eval_command( - source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"), + source: str | None = typer.Argument(None, help="Taskset slug or task JSON file"), agent: str | None = typer.Argument( None, help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test", # noqa: E501 diff --git a/hud/datasets/__init__.py b/hud/datasets/__init__.py index 6bf88851..8d4cebfc 100644 --- a/hud/datasets/__init__.py +++ b/hud/datasets/__init__.py @@ -3,7 +3,7 @@ Provides unified task loading, saving, and execution for HUD evaluations. Key functions: -- load_tasks(): Load tasks from JSON, JSONL, HuggingFace, or HUD API +- load_tasks(): Load tasks from JSON, JSONL, or HUD API - save_tasks(): Save tasks to the HUD API - run_dataset(): Run an agent on a dataset of tasks - submit_rollouts(): Submit tasks for remote execution diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index 48b6d8b0..e7be7bbb 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -3,7 +3,6 @@ Unified interface for loading evaluation tasks from: - HUD API (v5 format) - Local JSON/JSONL files (v4 LegacyTask format, auto-converted) -- HuggingFace datasets (v4 LegacyTask format, auto-converted) """ from __future__ import annotations @@ -71,43 +70,6 @@ def _load_from_file(path: Path) -> list[Task]: return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] -def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]: - """Load raw task dicts from HuggingFace dataset.""" - try: - from datasets import load_dataset as hf_load_dataset - except ImportError as e: - raise ImportError( - "Please install 'datasets' to load from HuggingFace: uv pip install datasets" - ) from e - - # Parse dataset name and optional split - if ":" in dataset_name: - name, split = dataset_name.split(":", 1) - else: - name = dataset_name - split = "train" # Default split - - logger.info("Loading from HuggingFace dataset: %s (split=%s)", name, split) - dataset = hf_load_dataset(name, split=split) - - raw_items: list[dict[str, Any]] = [] - for item in dataset: - if not isinstance(item, dict): - raise ValueError(f"Invalid HuggingFace dataset: expected dict, got {type(item)}") - raw_items.append(dict(item)) - - return raw_items - - -def _load_from_huggingface(dataset_name: str) -> list[Task]: - """Load tasks from HuggingFace dataset.""" - raw_items = _load_raw_from_huggingface(dataset_name) - from hud.eval.task import Task - - # Default args to {} for runnable tasks (None = template) - return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] - - def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | None]: """Load raw task dicts from HUD API. @@ -165,7 +127,6 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, Supports multiple sources with auto-detection: - Local file path (JSON or JSONL) - HUD API dataset slug (e.g., "hud-evals/SheetBench-50") - - HuggingFace dataset (e.g., "username/dataset" or "username/dataset:split") Automatically detects and converts v4 LegacyTask format to v5 Task. @@ -173,7 +134,6 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, source: Task source. Can be: - Path to a local JSON/JSONL file - HUD API dataset slug (e.g., "hud-evals/SheetBench-50") - - HuggingFace dataset name (e.g., "hud-evals/tasks" or "hud-evals/tasks:train") raw: If True, return raw dicts without validation or env var substitution. Useful for preserving template strings like "${HUD_API_KEY}". @@ -181,28 +141,6 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, - If raw=False (default): list[Task] ready to use with hud.eval() - If raw=True: list[dict] with raw task data - Example: - ```python - import hud - from hud.datasets import load_tasks - - # Load from HUD API - tasks = load_tasks("hud-evals/SheetBench-50") - - # Load from local file (v4 format auto-converted) - tasks = load_tasks("./my-tasks.json") - - # Load from HuggingFace - tasks = load_tasks("hud-evals/benchmark:test") - - # Load raw dicts (preserves env var placeholders) - raw_tasks = load_tasks("./tasks.json", raw=True) - - # Run evaluation - async with hud.eval(tasks) as ctx: - await agent.run(ctx) - ``` - Raises: ValueError: If task loading fails """ @@ -214,31 +152,14 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, logger.info("Loaded %d tasks from %s", len(items), source) return items - # Try HUD API first - try: - logger.info("Trying HUD API: %s", source) - if raw: - items, _ = _load_raw_from_api(source) - else: - items = _load_from_api(source) - logger.info("Loaded %d tasks from HUD API: %s", len(items), source) - return items - except Exception as hud_error: - logger.debug("HUD API load failed (%s), trying HuggingFace", hud_error) - - # Try HuggingFace as fallback - try: - logger.info("Trying HuggingFace dataset: %s", source) - items = _load_raw_from_huggingface(source) if raw else _load_from_huggingface(source) - logger.info("Loaded %d tasks from HuggingFace: %s", len(items), source) - return items - except ImportError: - raise ValueError( - f"Failed to load tasks from '{source}'. " - "Install 'datasets' package for HuggingFace support." - ) from None - except Exception as hf_error: - raise ValueError(f"Failed to load tasks from '{source}': {hf_error}") from hf_error + # Try HUD API + logger.info("Trying HUD API: %s", source) + if raw: + items, _ = _load_raw_from_api(source) + else: + items = _load_from_api(source) + logger.info("Loaded %d tasks from HUD API: %s", len(items), source) + return items def save_tasks( diff --git a/pyproject.toml b/pyproject.toml index b38d291f..910ae2ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,8 +113,6 @@ agents = [ "anthropic>=0.78.0", "google-genai", "openai-agents", - # Dataset loading (HuggingFace) - "datasets>=2.14.0", # Image processing for screenshots/grounding "pillow>=11.1.0", # Jupyter kernel support From b853aed8b1f8e7762b446a7d8b308e5d7c5bb286 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 12:10:11 -0800 Subject: [PATCH 04/14] simplify --- hud/cli/eval.py | 39 +++++++++++++++++++++++++++++---------- hud/datasets/loader.py | 14 +++++++------- hud/datasets/runner.py | 9 +++++---- hud/eval/manager.py | 39 +++++++++++---------------------------- 4 files changed, 52 insertions(+), 49 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 459eeabc..0d3ee4ee 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -609,22 +609,27 @@ def display(self) -> None: async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: """Run evaluation with the given config using run_dataset().""" - from hud.datasets import load_tasks, run_dataset + from pathlib import Path + + from hud.datasets import run_dataset + from hud.datasets.loader import _load_from_api, _load_from_file if cfg.source is None or cfg.agent_type is None: raise ValueError("source and agent_type must be set") - # Load tasks using unified loader (handles v4→v5 conversion automatically) + # Load tasks — use internal loaders to capture taskset_id from API sources hud_console.info(f"📊 Loading tasks from: {cfg.source}…") - tasks = load_tasks(cfg.source) + path = Path(cfg.source) + taskset_id: str | None = None + if path.exists() and path.suffix in {".json", ".jsonl"}: + tasks = _load_from_file(path) + else: + tasks, taskset_id = _load_from_api(cfg.source) if not tasks: hud_console.error(f"No tasks found in: {cfg.source}") raise typer.Exit(1) - # Extract taskset_id from API-loaded tasks (set by loader in metadata) - taskset_id: str | None = tasks[0].metadata.get("taskset_id") if tasks else None - # Filter by task slugs (or positional indices) if provided if cfg.task_ids: selector_set = set(cfg.task_ids) @@ -653,15 +658,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: max_steps = cfg.max_steps + import uuid + + from hud.eval.manager import _send_job_enter + # Remote execution - submit to HUD platform if cfg.remote: agent_kwargs = { k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client") } - import uuid - from hud.datasets.utils import submit_rollouts - from hud.eval.manager import _send_job_enter job_id = str(uuid.uuid4()) hud_console.info( @@ -728,6 +734,19 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: f"group_size: {cfg.group_size})…" ) + # Register job with taskset association if tasks came from API + job_id: str | None = None + if taskset_id: + job_id = str(uuid.uuid4()) + await _send_job_enter( + job_id=job_id, + name=f"eval ({cfg.source})" if cfg.source else "eval", + variants=None, + group=cfg.group_size, + api_key=None, + taskset_id=taskset_id, + ) + # Run using run_dataset results = await run_dataset( tasks, @@ -737,7 +756,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: max_concurrent=cfg.max_concurrent, group_size=cfg.group_size, quiet=cfg.quiet, - taskset_id=taskset_id, + job_id=job_id, ) # Show reward for single task diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index e7be7bbb..ebc0adde 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -101,16 +101,16 @@ def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | N return tasks, taskset_id -def _load_from_api(dataset_name: str) -> list[Task]: - """Load tasks from HUD API.""" +def _load_from_api(dataset_name: str) -> tuple[list[Task], str | None]: + """Load tasks from HUD API. + + Returns (tasks, taskset_id) tuple. + """ from hud.eval.task import Task raw_items, taskset_id = _load_raw_from_api(dataset_name) tasks = [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] - if taskset_id: - for task in tasks: - task.metadata["taskset_id"] = taskset_id - return tasks + return tasks, taskset_id @overload @@ -157,7 +157,7 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, if raw: items, _ = _load_raw_from_api(source) else: - items = _load_from_api(source) + items, _ = _load_from_api(source) logger.info("Loaded %d tasks from HUD API: %s", len(items), source) return items diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py index a34c0a76..d0244636 100644 --- a/hud/datasets/runner.py +++ b/hud/datasets/runner.py @@ -29,8 +29,7 @@ async def run_dataset( max_concurrent: int = 30, group_size: int = 1, quiet: bool = True, - taskset: str | None = None, - taskset_id: str | None = None, + job_id: str | None = None, ) -> list[EvalContext]: """Run an agent on a dataset of tasks. @@ -48,6 +47,9 @@ async def run_dataset( max_concurrent: Maximum concurrent tasks (for parallel execution). group_size: Number of times to run each task (for variance estimation). quiet: Whether to suppress printing eval links and opening browser (default True). + job_id: Pre-registered job ID. If provided, traces are grouped under this job + and no implicit job is created. If None, a job is created automatically + for parallel execution. Returns: List of EvalContext results from each task execution. Access `.reward` on each. @@ -98,8 +100,7 @@ async def run_dataset( group=group_size, max_concurrent=max_concurrent, quiet=quiet, - taskset=taskset, - taskset_id=taskset_id, + job_id=job_id, ) as ctx: # Build agent params - use system_prompt from ctx (set from task.agent_config) final_agent_params = dict(agent_params or {}) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 4b3ffcd9..0908738e 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -120,8 +120,6 @@ async def run_eval( max_concurrent: int | None = None, trace: bool = True, quiet: bool = False, - taskset: str | None = None, - taskset_id: str | None = None, ) -> AsyncGenerator[EvalContext, None]: """Standalone eval context manager. @@ -139,7 +137,7 @@ async def run_eval( variants: A/B test configuration (dict with list values expanded) group: Runs per variant for statistical significance group_ids: Optional list of group IDs - job_id: Job ID to link to + job_id: Pre-registered job ID. Skips implicit job creation if provided. group_id: Group ID for parallel evaluations trace_id: Pre-assigned trace ID (auto-generated if not provided) api_key: API key for backend calls @@ -252,28 +250,13 @@ async def run_eval( if total_evals == 1: if tasks: - job_id_for_run = job_id - if taskset or taskset_id: - eval_name = _get_eval_name(tasks=tasks, group=group) - if job_id_for_run is None: - job_id_for_run = str(uuid.uuid4()) - - await _send_job_enter( - job_id=job_id_for_run, - name=eval_name, - variants=variants, - group=group, - api_key=api_key, - taskset_id=taskset_id, - ) - # Single task - use EvalContext.from_task() ctx = EvalContext.from_task( tasks[0], name=name, trace_id=trace_id, api_key=api_key, - job_id=job_id_for_run, + job_id=job_id, group_id=group_id, variants=variant_combos[0], code_snippet=code_snippet, @@ -304,15 +287,15 @@ async def run_eval( implicit_job_id = job_id or str(uuid.uuid4()) job_url = f"https://hud.ai/jobs/{implicit_job_id}" - # Send job enter (sync request before traces start) - await _send_job_enter( - job_id=implicit_job_id, - name=eval_name, - variants=variants, - group=group, - api_key=api_key, - taskset_id=taskset_id, - ) + # Register job if not already provided by caller + if not job_id: + await _send_job_enter( + job_id=implicit_job_id, + name=eval_name, + variants=variants, + group=group, + api_key=api_key, + ) # Print job URL (not individual trace URLs) if not quiet: From 97426adf7a1efbb368c234567920260fdbee7735 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 17:11:43 -0800 Subject: [PATCH 05/14] align eval naming --- hud/cli/eval.py | 19 +++---------------- hud/datasets/runner.py | 3 +++ hud/eval/manager.py | 3 +++ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 0d3ee4ee..bb386245 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -660,7 +660,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: import uuid - from hud.eval.manager import _send_job_enter + from hud.eval.manager import _get_eval_name, _send_job_enter # Remote execution - submit to HUD platform if cfg.remote: @@ -696,7 +696,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: await _send_job_enter( job_id=job_id, - name=f"eval ({cfg.source})" if cfg.source else "eval", + name=_get_eval_name(tasks=tasks, group=cfg.group_size), variants=None, group=cfg.group_size, api_key=None, @@ -734,19 +734,6 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: f"group_size: {cfg.group_size})…" ) - # Register job with taskset association if tasks came from API - job_id: str | None = None - if taskset_id: - job_id = str(uuid.uuid4()) - await _send_job_enter( - job_id=job_id, - name=f"eval ({cfg.source})" if cfg.source else "eval", - variants=None, - group=cfg.group_size, - api_key=None, - taskset_id=taskset_id, - ) - # Run using run_dataset results = await run_dataset( tasks, @@ -756,7 +743,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: max_concurrent=cfg.max_concurrent, group_size=cfg.group_size, quiet=cfg.quiet, - job_id=job_id, + taskset_id=taskset_id, ) # Show reward for single task diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py index d0244636..f6d21b60 100644 --- a/hud/datasets/runner.py +++ b/hud/datasets/runner.py @@ -30,6 +30,7 @@ async def run_dataset( group_size: int = 1, quiet: bool = True, job_id: str | None = None, + taskset_id: str | None = None, ) -> list[EvalContext]: """Run an agent on a dataset of tasks. @@ -50,6 +51,7 @@ async def run_dataset( job_id: Pre-registered job ID. If provided, traces are grouped under this job and no implicit job is created. If None, a job is created automatically for parallel execution. + taskset_id: Taskset UUID to associate the job with on the platform. Returns: List of EvalContext results from each task execution. Access `.reward` on each. @@ -101,6 +103,7 @@ async def run_dataset( max_concurrent=max_concurrent, quiet=quiet, job_id=job_id, + taskset_id=taskset_id, ) as ctx: # Build agent params - use system_prompt from ctx (set from task.agent_config) final_agent_params = dict(agent_params or {}) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 0908738e..5e535520 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -118,6 +118,7 @@ async def run_eval( trace_id: str | None = None, api_key: str | None = None, max_concurrent: int | None = None, + taskset_id: str | None = None, trace: bool = True, quiet: bool = False, ) -> AsyncGenerator[EvalContext, None]: @@ -142,6 +143,7 @@ async def run_eval( trace_id: Pre-assigned trace ID (auto-generated if not provided) api_key: API key for backend calls max_concurrent: Maximum concurrent evals (None = unlimited) + taskset_id: Taskset UUID to associate the job with on the platform. trace: Whether to send trace data to backend (default True) quiet: Whether to suppress printing links (default False) @@ -295,6 +297,7 @@ async def run_eval( variants=variants, group=group, api_key=api_key, + taskset_id=taskset_id, ) # Print job URL (not individual trace URLs) From 3d9daeec72cb1a6bfe00c57e49ca6be224a963f2 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 17:22:01 -0800 Subject: [PATCH 06/14] align eval names --- hud/eval/manager.py | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index 5e535520..f2d793bc 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -31,38 +31,26 @@ def _get_eval_name(tasks: list[Task] | None = None, group: int = 1) -> str: - """Extract a nice name for job display. + """Build a job display name. - Args: - tasks: List of Task objects - group: Group size (runs per task) - - Returns: - Name like "scenario (group=5)" for single task or "eval (50 tasks)" for batch + Convention: + 1 task, group=1: "Task Run: {scenario}" + 1 task, group>1: "Task Run: {scenario} (4 times)" + N tasks, group=1: "Batch Run: N tasks" + N tasks, group>1: "Batch Run: N tasks (4 times)" """ + suffix = f" ({group} times)" if group > 1 else "" + if not tasks: - return "eval" + return f"Task Run: eval{suffix}" - # Single task: use scenario/env name if len(tasks) == 1: - name = None - if tasks[0].scenario: - name = tasks[0].scenario - elif tasks[0].env and hasattr(tasks[0].env, "name"): + name = tasks[0].scenario + if not name and tasks[0].env and hasattr(tasks[0].env, "name"): name = tasks[0].env.name + return f"Task Run: {name or 'eval'}{suffix}" - if name: - if group > 1: - return f"{name} (group={group})" - return name - return "eval" - - # Batch: use generic name with count - parts = [f"{len(tasks)} tasks"] - if group > 1: - parts.append(f"group={group}") - - return f"eval ({', '.join(parts)})" + return f"Batch Run: {len(tasks)} tasks{suffix}" async def _send_job_enter( From 7054a26444f30e476a74f886e7d5c93c09e119c7 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 18:48:05 -0800 Subject: [PATCH 07/14] update tests --- hud/datasets/tests/test_loader.py | 32 ++++++++++++++++------------- hud/tests/test_datasets_extended.py | 3 ++- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/hud/datasets/tests/test_loader.py b/hud/datasets/tests/test_loader.py index 384e3c6e..5c8a92ea 100644 --- a/hud/datasets/tests/test_loader.py +++ b/hud/datasets/tests/test_loader.py @@ -137,45 +137,49 @@ def test_load_tasks_no_api_key( @patch("hud.datasets.loader.httpx.Client") @patch("hud.datasets.loader.settings") - def test_load_tasks_http_error( + def test_load_tasks_taskset_not_found( self, mock_settings: MagicMock, mock_client_class: MagicMock ) -> None: - """load_tasks() raises ValueError on HTTP error.""" + """load_tasks() raises HTTPStatusError when taskset doesn't exist.""" import httpx mock_settings.hud_api_url = "https://api.hud.ai" mock_settings.api_key = "test_key" + mock_response = MagicMock() + mock_response.status_code = 404 + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Not Found", request=MagicMock(), response=mock_response + ) + mock_client = MagicMock() - mock_client.get.side_effect = httpx.HTTPError("Network error") + mock_client.get.return_value = mock_response mock_client.__enter__.return_value = mock_client mock_client.__exit__.return_value = None mock_client_class.return_value = mock_client - with pytest.raises(ValueError, match="Failed to load tasks"): - load_tasks("test-org/test-dataset") + with pytest.raises(httpx.HTTPStatusError): + load_tasks("nonexistent-taskset") @patch("hud.datasets.loader.httpx.Client") @patch("hud.datasets.loader.settings") - def test_load_tasks_json_error( + def test_load_tasks_network_error( self, mock_settings: MagicMock, mock_client_class: MagicMock ) -> None: - """load_tasks() raises ValueError on JSON processing error.""" + """load_tasks() raises ConnectError when API is unreachable.""" + import httpx + mock_settings.hud_api_url = "https://api.hud.ai" mock_settings.api_key = "test_key" - mock_response = MagicMock() - mock_response.json.side_effect = Exception("Invalid JSON") - mock_response.raise_for_status = MagicMock() - mock_client = MagicMock() - mock_client.get.return_value = mock_response + mock_client.get.side_effect = httpx.ConnectError("Connection refused") mock_client.__enter__.return_value = mock_client mock_client.__exit__.return_value = None mock_client_class.return_value = mock_client - with pytest.raises(ValueError, match="Failed to load tasks"): - load_tasks("test-org/test-dataset") + with pytest.raises(httpx.ConnectError): + load_tasks("my-taskset") @patch("hud.datasets.loader.httpx.Client") @patch("hud.datasets.loader.settings") diff --git a/hud/tests/test_datasets_extended.py b/hud/tests/test_datasets_extended.py index 3a870aaa..67b23a8c 100644 --- a/hud/tests/test_datasets_extended.py +++ b/hud/tests/test_datasets_extended.py @@ -237,5 +237,6 @@ async def test_run_dataset_passes_parameters(self): group=3, max_concurrent=10, quiet=True, - taskset=None, + job_id=None, + taskset_id=None, ) From b296c43bdaa1d796e9ea8e877680d17c5046e40a Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 22:17:18 -0800 Subject: [PATCH 08/14] validation for subscores --- hud/tools/tests/test_types.py | 77 +++++++++++++++++++++++++++++++++++ hud/tools/types.py | 32 +++++++++++++-- 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/hud/tools/tests/test_types.py b/hud/tools/tests/test_types.py index daba05a4..157be404 100644 --- a/hud/tools/tests/test_types.py +++ b/hud/tools/tests/test_types.py @@ -437,3 +437,80 @@ def test_evaluation_result_isError_flag(): assert result.isError is True assert result.reward == 0.0 + + +# Tests for SubScore and EvaluationResult validators + + +def test_subscore_value_range_rejected(): + """Test SubScore rejects values outside [0, 1].""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + SubScore(name="test", value=-0.1) + with pytest.raises(ValidationError): + SubScore(name="test", value=1.5) + + +def test_check_subscores_duplicate_names_warns(): + """Test duplicate subscore names produce a warning.""" + import warnings + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + EvaluationResult( + reward=0.5, + subscores=[ + SubScore(name="accuracy", weight=0.5, value=0.5), + SubScore(name="accuracy", weight=0.5, value=0.5), + ], + ) + assert any("Duplicate subscore names" in str(x.message) for x in w) + + +def test_check_subscores_weights_not_summing_to_one_warns(): + """Test positive weights not summing to ~1.0 produce a warning.""" + import warnings + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + EvaluationResult( + reward=0.75, + subscores=[ + SubScore(name="a", weight=0.5, value=1.0), + SubScore(name="b", weight=0.25, value=1.0), + ], + ) + assert any("Positive subscore weights should sum to ~1.0" in str(x.message) for x in w) + + +def test_check_subscores_reward_mismatch_warns(): + """Test weighted sum not matching reward produces a warning.""" + import warnings + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + EvaluationResult( + reward=0.5, + subscores=[SubScore(name="a", weight=1.0, value=0.8)], + ) + assert any("Subscores don't match reward" in str(x.message) for x in w) + + +def test_check_subscores_valid_with_negative_weights(): + """Test valid subscores with negative weights produce no warnings.""" + import warnings + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + # Positive: 0.6 + 0.4 = 1.0 + # Weighted sum: 0.6*1.0 + 0.4*0.5 + (-0.2)*1.0 = 0.6 + EvaluationResult( + reward=0.6, + subscores=[ + SubScore(name="quality", weight=0.6, value=1.0), + SubScore(name="speed", weight=0.4, value=0.5), + SubScore(name="penalty", weight=-0.2, value=1.0), + ], + ) + assert len(w) == 0 diff --git a/hud/tools/types.py b/hud/tools/types.py index d4d0aa16..83c741bc 100644 --- a/hud/tools/types.py +++ b/hud/tools/types.py @@ -3,7 +3,9 @@ from typing import Any from mcp.types import ContentBlock, ImageContent, TextContent -from pydantic import BaseModel, ConfigDict, Field +import warnings + +from pydantic import BaseModel, ConfigDict, Field, model_validator class Coordinate(BaseModel): @@ -36,8 +38,8 @@ class SubScore(BaseModel): model_config = ConfigDict(extra="forbid") name: str = Field(..., description="Name of this subscore component") - weight: float = Field(default=1.0, description="Weight of this subscore (for weighted average)") - value: float = Field(..., description="Value of this subscore, usually 0.0 to 1.0") + weight: float = Field(default=1.0, description="Weight of this subscore (for weighted average). Negative weights represent penalties.") + value: float = Field(..., ge=0.0, le=1.0, description="Value of this subscore, 0.0 to 1.0") metadata: dict[str, Any] | None = Field(default=None, exclude=True) @property @@ -76,6 +78,30 @@ class EvaluationResult(BaseModel): model_config = ConfigDict(extra="allow") + @model_validator(mode="after") + def _check_subscores(self) -> EvaluationResult: + if not self.subscores: + return self + names = [s.name for s in self.subscores] + dupes = [n for n in names if names.count(n) > 1] + if dupes: + warnings.warn(f"Duplicate subscore names: {set(dupes)}", stacklevel=2) + pos_weight_sum = sum(s.weight for s in self.subscores if s.weight > 0) + if abs(pos_weight_sum - 1.0) > 0.01: + warnings.warn( + f"Positive subscore weights should sum to ~1.0 (got {pos_weight_sum:.4f}). " + f"Weights represent proportional contributions to the reward.", + stacklevel=2, + ) + weighted_sum = sum(s.value * s.weight for s in self.subscores) + if abs(weighted_sum - self.reward) > 0.01: + warnings.warn( + f"Subscores don't match reward: " + f"sum(value*weight)={weighted_sum:.4f} but reward={self.reward:.4f}", + stacklevel=2, + ) + return self + @classmethod def from_float(cls, value: float) -> EvaluationResult: """Create an EvaluationResult from a simple float reward. From 5f2af134bb1b3c7ddbd8a896b64f08b31daceea5 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 22:18:19 -0800 Subject: [PATCH 09/14] ruff --- hud/tools/types.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hud/tools/types.py b/hud/tools/types.py index 83c741bc..7e014cfa 100644 --- a/hud/tools/types.py +++ b/hud/tools/types.py @@ -1,10 +1,9 @@ from __future__ import annotations +import warnings from typing import Any from mcp.types import ContentBlock, ImageContent, TextContent -import warnings - from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -38,7 +37,11 @@ class SubScore(BaseModel): model_config = ConfigDict(extra="forbid") name: str = Field(..., description="Name of this subscore component") - weight: float = Field(default=1.0, description="Weight of this subscore (for weighted average). Negative weights represent penalties.") + weight: float = Field( + default=1.0, + description="Weight of this subscore (for weighted average). " + "Negative weights represent penalties.", + ) value: float = Field(..., ge=0.0, le=1.0, description="Value of this subscore, 0.0 to 1.0") metadata: dict[str, Any] | None = Field(default=None, exclude=True) From 437c79c544e4ae3beb370caf413b3714824451a9 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 22:19:56 -0800 Subject: [PATCH 10/14] bump version to 0.5.29 --- hud/cli/tests/test_build.py | 4 ++-- hud/utils/tests/test_version.py | 2 +- hud/version.py | 2 +- pyproject.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hud/cli/tests/test_build.py b/hud/cli/tests/test_build.py index c6ebadab..94cf51c0 100644 --- a/hud/cli/tests/test_build.py +++ b/hud/cli/tests/test_build.py @@ -60,12 +60,12 @@ def test_increment_patch(self): def test_increment_minor(self): """Test incrementing minor version.""" assert increment_version("1.2.3", "minor") == "1.3.0" - assert increment_version("0.5.28", "minor") == "0.6.0" + assert increment_version("0.5.29", "minor") == "0.6.0" def test_increment_major(self): """Test incrementing major version.""" assert increment_version("1.2.3", "major") == "2.0.0" - assert increment_version("0.5.28", "major") == "1.0.0" + assert increment_version("0.5.29", "major") == "1.0.0" def test_increment_with_v_prefix(self): """Test incrementing version with v prefix.""" diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py index ada8b553..16c8ade8 100644 --- a/hud/utils/tests/test_version.py +++ b/hud/utils/tests/test_version.py @@ -5,4 +5,4 @@ def test_import(): """Test that the package can be imported.""" import hud - assert hud.__version__ == "0.5.28" + assert hud.__version__ == "0.5.29" diff --git a/hud/version.py b/hud/version.py index 7f525011..c7b914db 100644 --- a/hud/version.py +++ b/hud/version.py @@ -4,4 +4,4 @@ from __future__ import annotations -__version__ = "0.5.28" +__version__ = "0.5.29" diff --git a/pyproject.toml b/pyproject.toml index 910ae2ee..55999bcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hud-python" -version = "0.5.28" +version = "0.5.29" description = "SDK for the HUD platform." readme = "README.md" requires-python = ">=3.11, <3.13" From 305290d892f3872bb31798ee7da7781d3508d777 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 23:43:54 -0800 Subject: [PATCH 11/14] enhance error handling and job registration in evaluation process --- hud/cli/eval.py | 15 +++++++++---- hud/eval/manager.py | 54 +++++++++++++++++---------------------------- 2 files changed, 31 insertions(+), 38 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index bb386245..f0b58768 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -621,15 +621,22 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: hud_console.info(f"📊 Loading tasks from: {cfg.source}…") path = Path(cfg.source) taskset_id: str | None = None - if path.exists() and path.suffix in {".json", ".jsonl"}: - tasks = _load_from_file(path) - else: - tasks, taskset_id = _load_from_api(cfg.source) + try: + if path.exists() and path.suffix in {".json", ".jsonl"}: + tasks = _load_from_file(path) + else: + tasks, taskset_id = _load_from_api(cfg.source) + except Exception as e: + hud_console.error(f"Failed to load tasks from {cfg.source}: {e}") + raise typer.Exit(1) from e if not tasks: hud_console.error(f"No tasks found in: {cfg.source}") raise typer.Exit(1) + if cfg.taskset: + taskset_id = cfg.taskset + # Filter by task slugs (or positional indices) if provided if cfg.task_ids: selector_set = set(cfg.task_ids) diff --git a/hud/eval/manager.py b/hud/eval/manager.py index f2d793bc..78d552a5 100644 --- a/hud/eval/manager.py +++ b/hud/eval/manager.py @@ -238,9 +238,21 @@ async def run_eval( # Lazy import to avoid circular dependency from hud.eval.context import EvalContext + # Register job if not already provided by caller + eval_name = _get_eval_name(tasks=tasks, group=group) + if not job_id and (taskset_id or total_evals > 1): + job_id = str(uuid.uuid4()) + await _send_job_enter( + job_id=job_id, + name=eval_name, + variants=variants, + group=group, + api_key=api_key, + taskset_id=taskset_id, + ) + if total_evals == 1: if tasks: - # Single task - use EvalContext.from_task() ctx = EvalContext.from_task( tasks[0], name=name, @@ -256,7 +268,6 @@ async def run_eval( async with ctx: yield ctx else: - # Blank eval - use EvalContext directly ctx = EvalContext( name=name or "eval", trace_id=trace_id, @@ -272,35 +283,19 @@ async def run_eval( yield ctx else: - # Parallel execution: create implicit job to group traces - eval_name = _get_eval_name(tasks=tasks, group=group) - implicit_job_id = job_id or str(uuid.uuid4()) - job_url = f"https://hud.ai/jobs/{implicit_job_id}" - - # Register job if not already provided by caller - if not job_id: - await _send_job_enter( - job_id=implicit_job_id, - name=eval_name, - variants=variants, - group=group, - api_key=api_key, - taskset_id=taskset_id, - ) + job_url = f"https://hud.ai/jobs/{job_id}" - # Print job URL (not individual trace URLs) if not quiet: print_link(job_url, f"🚀 {eval_name}") error_occurred = False try: - # Run parallel evals with job_id completed = await _run_parallel_eval( tasks=tasks, variant_combos=variant_combos, group=group, group_ids=group_ids, - job_id=implicit_job_id, # Propagate job_id to child traces + job_id=job_id, api_key=api_key, code_snippet=code_snippet, max_concurrent=max_concurrent, @@ -308,20 +303,11 @@ async def run_eval( quiet=quiet, ) - # Create summary context (no trace, just aggregates results) - if tasks: - # Create summary from first task - ctx = EvalContext( - name=eval_name, # Use the same smart name - api_key=api_key, - job_id=implicit_job_id, - ) - else: - ctx = EvalContext( - name="eval", - api_key=api_key, - job_id=implicit_job_id, - ) + ctx = EvalContext( + name=eval_name, + api_key=api_key, + job_id=job_id, + ) ctx._is_summary = True # Skip trace tracking ctx.results = completed From 0aed66402ad417ff9c524541a716ad3b5d46c9c3 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 27 Feb 2026 23:56:39 -0800 Subject: [PATCH 12/14] tests --- hud/eval/tests/test_manager.py | 88 +++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/hud/eval/tests/test_manager.py b/hud/eval/tests/test_manager.py index 9b237382..2afd73a3 100644 --- a/hud/eval/tests/test_manager.py +++ b/hud/eval/tests/test_manager.py @@ -7,7 +7,8 @@ import pytest from hud.eval.context import EvalContext, get_current_trace_headers -from hud.eval.manager import run_eval +from hud.eval.manager import _get_eval_name, run_eval +from hud.eval.task import Task class TestRunEvalNoArgs: @@ -150,3 +151,88 @@ async def test_error_tracked_on_exception(self) -> None: error_msg = mock_exit.call_args[0][0] assert error_msg is not None assert "test error" in error_msg + + +class TestGetEvalName: + """Tests for _get_eval_name() naming convention.""" + + def test_no_tasks(self) -> None: + assert _get_eval_name() == "Task Run: eval" + + def test_no_tasks_with_group(self) -> None: + assert _get_eval_name(group=4) == "Task Run: eval (4 times)" + + def test_single_task_with_scenario(self) -> None: + tasks = [Task(env={"name": "browser"}, scenario="checkout")] + assert _get_eval_name(tasks=tasks) == "Task Run: checkout" + + def test_single_task_with_scenario_and_group(self) -> None: + tasks = [Task(env={"name": "browser"}, scenario="checkout")] + assert _get_eval_name(tasks=tasks, group=4) == "Task Run: checkout (4 times)" + + def test_single_task_no_scenario_uses_env_name(self) -> None: + tasks = [Task(env={"name": "my-env"})] + assert _get_eval_name(tasks=tasks) == "Task Run: my-env" + + def test_multiple_tasks(self) -> None: + tasks = [ + Task(env={"name": "browser"}, scenario="checkout"), + Task(env={"name": "browser"}, scenario="login"), + ] + assert _get_eval_name(tasks=tasks) == "Batch Run: 2 tasks" + + def test_multiple_tasks_with_group(self) -> None: + tasks = [ + Task(env={"name": "browser"}, scenario="checkout"), + Task(env={"name": "browser"}, scenario="login"), + Task(env={"name": "browser"}, scenario="search"), + ] + assert _get_eval_name(tasks=tasks, group=3) == "Batch Run: 3 tasks (3 times)" + + +class TestRunEvalTasksetId: + """Tests for taskset_id flow through run_eval.""" + + @pytest.mark.asyncio + async def test_taskset_id_triggers_job_registration(self) -> None: + """run_eval(taskset_id=...) registers a job even for single task.""" + with ( + patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock), + patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock), + patch("hud.eval.manager._send_job_enter", new_callable=AsyncMock) as mock_enter, + ): + async with run_eval(taskset_id="ts-123", quiet=True) as ctx: + pass + + mock_enter.assert_called_once() + call_kwargs = mock_enter.call_args[1] + assert call_kwargs["taskset_id"] == "ts-123" + assert ctx.job_id == call_kwargs["job_id"] + + @pytest.mark.asyncio + async def test_no_taskset_no_job_for_single_task(self) -> None: + """run_eval() without taskset_id does not register a job for single task.""" + with ( + patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock), + patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock), + patch("hud.eval.manager._send_job_enter", new_callable=AsyncMock) as mock_enter, + ): + async with run_eval(quiet=True) as ctx: + pass + + mock_enter.assert_not_called() + assert ctx.job_id is None + + @pytest.mark.asyncio + async def test_provided_job_id_skips_registration(self) -> None: + """run_eval(job_id=..., taskset_id=...) uses provided job_id without registering.""" + with ( + patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock), + patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock), + patch("hud.eval.manager._send_job_enter", new_callable=AsyncMock) as mock_enter, + ): + async with run_eval(job_id="existing-job", taskset_id="ts-123", quiet=True) as ctx: + pass + + mock_enter.assert_not_called() + assert ctx.job_id == "existing-job" From 3ed7330bdfd3a3d0a9ea99d8fb4d574ba4f6dce2 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Sat, 28 Feb 2026 00:15:27 -0800 Subject: [PATCH 13/14] taskset name resolution --- hud/cli/eval.py | 14 +++++++++++--- hud/datasets/loader.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index f0b58768..7a1fb18d 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -186,7 +186,7 @@ class EvalConfig(BaseModel): remote: bool = False quiet: bool = False # Suppress opening browser for eval links gateway: bool = False # Use HUD Gateway for LLM API calls - taskset: str | None = None # Taskset slug to associate job with + taskset: str | None = None # Taskset name to associate job with # Base agent config (these merge with task's agent_config) allowed_tools: list[str] | None = None @@ -634,8 +634,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: hud_console.error(f"No tasks found in: {cfg.source}") raise typer.Exit(1) + # TODO: --taskset with file source should sync local tasks to the platform taskset + # (diff, save, then run). For now it just resolves the slug and associates the job. if cfg.taskset: - taskset_id = cfg.taskset + from hud.datasets.loader import resolve_taskset_id + + try: + taskset_id = resolve_taskset_id(cfg.taskset) + except Exception as e: + hud_console.error(f"Failed to resolve taskset '{cfg.taskset}': {e}") + raise typer.Exit(1) from e # Filter by task slugs (or positional indices) if provided if cfg.task_ids: @@ -827,7 +835,7 @@ def eval_command( False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway" ), taskset: str | None = typer.Option( - None, "--taskset", "-t", help="Taskset slug to associate job with" + None, "--taskset", "-t", help="Taskset name to associate job with" ), ) -> None: """🚀 Run evaluation on datasets or individual tasks with agents. diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index ebc0adde..63b355ee 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -70,6 +70,26 @@ def _load_from_file(path: Path) -> list[Task]: return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items] +def resolve_taskset_id(slug: str) -> str: + """Resolve a taskset slug/name to its UUID via the HUD API.""" + headers = {} + if settings.api_key: + headers["Authorization"] = f"Bearer {settings.api_key}" + + with httpx.Client() as client: + response = client.get( + f"{settings.hud_api_url}/tasks/evalset/{slug}", + headers=headers, + ) + response.raise_for_status() + data = response.json() + + evalset_id = data.get("evalset_id") + if not evalset_id: + raise ValueError(f"Could not resolve taskset '{slug}' — not found or no access") + return evalset_id + + def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | None]: """Load raw task dicts from HUD API. From 7df816120b34a09755daadbc94420d015b8b0e41 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Sat, 28 Feb 2026 00:37:32 -0800 Subject: [PATCH 14/14] docs update --- docs/platform/tasksets.mdx | 2 +- docs/quick-links/evals.mdx | 11 +++++--- docs/reference/cli/eval.mdx | 56 +++++++++++++++++++++++++++---------- docs/reference/evals.mdx | 1 + hud/cli/eval.py | 25 +++-------------- hud/datasets/loader.py | 6 ++-- hud/datasets/utils.py | 7 ----- 7 files changed, 59 insertions(+), 49 deletions(-) diff --git a/docs/platform/tasksets.mdx b/docs/platform/tasksets.mdx index 3427aab0..70d5df4f 100644 --- a/docs/platform/tasksets.mdx +++ b/docs/platform/tasksets.mdx @@ -116,7 +116,7 @@ Run all tasks in a taskset with one click: Or run from the CLI: ```bash -hud eval my-taskset --model gpt-4o --group-size 10 +hud eval "My Tasks" claude --full --group-size 10 ``` ## Task Configuration diff --git a/docs/quick-links/evals.mdx b/docs/quick-links/evals.mdx index f7e60947..71dffe11 100644 --- a/docs/quick-links/evals.mdx +++ b/docs/quick-links/evals.mdx @@ -85,14 +85,17 @@ See [Platform Models](/platform/models) for training details. Prefer the command line? Use `hud eval` for running evaluations locally or remotely: ```bash -# Run a taskset with a model -hud eval my-taskset claude --full +# Run a platform taskset with a model +hud eval "My Tasks" claude --full # Run with multiple repeats for variance -hud eval my-taskset claude --full --group-size 5 +hud eval "My Tasks" claude --full --group-size 5 # Run remotely on HUD infrastructure -hud eval my-taskset claude --full --remote +hud eval "My Tasks" claude --full --remote + +# Run from a local file, linked to a platform taskset +hud eval tasks.json claude --full --taskset "My Tasks" ``` See [`hud eval` CLI reference](/reference/cli/eval) for all options. diff --git a/docs/reference/cli/eval.mdx b/docs/reference/cli/eval.mdx index da2f8219..d10e361f 100644 --- a/docs/reference/cli/eval.mdx +++ b/docs/reference/cli/eval.mdx @@ -4,7 +4,7 @@ description: "Run agents on tasks or datasets" icon: "robot" --- -The `hud eval` command runs an agent on a tasks file or HuggingFace dataset. +The `hud eval` command runs an agent on a tasks file or platform taskset. **Local Execution Dependencies**: Running Claude or Gemini agents locally requires additional packages: @@ -23,7 +23,7 @@ hud eval [SOURCE] [AGENT] [OPTIONS] ## Arguments - HuggingFace dataset (e.g., `hud-evals/SheetBench-50`) or task JSON/JSONL file. + Platform taskset name (e.g., `My Tasks`) or local task JSON/JSONL file. When loading from a platform taskset, the job is automatically associated with that taskset. @@ -82,6 +82,18 @@ hud eval [SOURCE] [AGENT] [OPTIONS] Use ResponseAgent to decide when to stop/continue. Default: True for `--full`. +### Taskset Association + + + Taskset name to associate the job with. Resolves the name to a taskset UUID on the platform. Useful when running from a local file but wanting the job to appear under a platform taskset. + + +### LLM Routing + + + Route LLM API calls through HUD Gateway. If you have API keys stored on the platform, they're used automatically (BYOK) at a lower credit cost. Otherwise, pooled keys are used. + + ### Output & Confirmation @@ -92,6 +104,10 @@ hud eval [SOURCE] [AGENT] [OPTIONS] Enable debug-level logs. + + Suppress opening the browser for eval links. + + Skip confirmation prompt. @@ -107,14 +123,16 @@ On first run, a template is created: ```toml # .hud_eval.toml [eval] -# source = "hud-evals/SheetBench-50" +# source = "My Tasks" # agent = "claude" # full = false # max_concurrent = 30 # max_steps = 10 # group_size = 1 -# task_ids = ["task_1", "task_2"] +# task_ids = ["checkout-smoke", "0"] # slugs or 0-based indices # auto_respond = true +# gateway = false +# quiet = false [agent] # allowed_tools = ["computer", "playwright"] @@ -141,11 +159,17 @@ On first run, a template is created: ## Examples ```bash -# Single task (debug mode) -hud eval tasks.json claude +# Run a platform taskset (single task) +hud eval "My Tasks" claude + +# Full taskset evaluation +hud eval "My Tasks" claude --full -# Full dataset evaluation -hud eval hud-evals/SheetBench-50 claude --full +# Run from a local file +hud eval tasks.json claude --full + +# Local file, associated with a platform taskset +hud eval tasks.json claude --full --taskset "My Tasks" # Run specific tasks by ID hud eval tasks.json claude --task-ids task_1,task_5 @@ -158,13 +182,16 @@ hud eval tasks.json claude --config max_tokens=32768 hud eval tasks.json openai --config temperature=0.7 # High concurrency -hud eval hud-evals/SheetBench-50 claude --full --max-concurrent 100 +hud eval "My Tasks" claude --full --max-concurrent 100 # Variance estimation (run each task 3 times) -hud eval tasks.json claude --full --group-size 3 +hud eval "My Tasks" claude --full --group-size 3 # Remote execution on HUD platform -hud eval hud-evals/SheetBench-50 claude --full --remote +hud eval "My Tasks" claude --full --remote + +# Route through HUD Gateway (no provider API keys needed) +hud eval tasks.json claude --full --gateway # OpenAI-compatible endpoint (vLLM, Ollama, etc.) hud eval tasks.json openai_compatible \ @@ -187,8 +214,9 @@ When agent is omitted, an interactive selector shows presets: ❯ Claude Sonnet 4.5 GPT-5 Operator (OpenAI Computer Use) - Gemini 2.5 Computer Use - Grok 4.1 Fast + Gemini 3 Pro Preview + Gemini CUA (Gemini Computer Use) + Grok 4-1 Fast (xAI) ``` ## Remote Execution @@ -196,7 +224,7 @@ When agent is omitted, an interactive selector shows presets: With `--remote`, both the **agent** and **environment** run on HUD infrastructure: ```bash -hud eval hud-evals/SheetBench-50 claude --full --remote +hud eval "My Tasks" claude --full --remote ``` - **Remote agent**: Runs on HUD workers (no local compute needed) diff --git a/docs/reference/evals.mdx b/docs/reference/evals.mdx index 0a19a6e1..e2471d92 100644 --- a/docs/reference/evals.mdx +++ b/docs/reference/evals.mdx @@ -26,6 +26,7 @@ async with hud.eval() as ctx: | `group` | `int` | Runs per variant for statistical significance | `1` | | `group_ids` | `list[str] \| None` | Custom group IDs for parallel runs | `None` | | `job_id` | `str \| None` | Job ID to link traces to | `None` | +| `taskset_id` | `str \| None` | Platform taskset UUID to associate the job with | `None` | | `api_key` | `str \| None` | API key for backend calls | `None` | | `max_concurrent` | `int \| None` | Maximum concurrent evaluations | `None` | | `trace` | `bool` | Send telemetry to backend | `True` | diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 7a1fb18d..9e9db45f 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -96,7 +96,6 @@ class AgentPreset: # max_concurrent = 30 # max_steps = 10 # group_size = 1 -# byok = false # Remote only; use encrypted env vars on the platform. # task_ids = ["checkout-smoke", "0"] # slugs or 0-based indices # verbose = true # very_verbose = true @@ -160,7 +159,6 @@ class EvalConfig(BaseModel): "verbose", "very_verbose", "group_size", - "byok", "remote", "auto_respond", "quiet", @@ -182,7 +180,6 @@ class EvalConfig(BaseModel): very_verbose: bool = False auto_respond: bool | None = None # Continue without prompting group_size: int = 1 - byok: bool = False remote: bool = False quiet: bool = False # Suppress opening browser for eval links gateway: bool = False # Use HUD Gateway for LLM API calls @@ -214,11 +211,6 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None: def validate_api_keys(self) -> None: """Validate required API keys for the selected agent. Raises typer.Exit on failure.""" - # BYOK requires remote execution (check before agent_type guard) - if self.byok and not self.remote: - hud_console.error("--byok requires --remote (BYOK only works with remote execution)") - raise typer.Exit(1) - if self.agent_type is None: return @@ -547,8 +539,6 @@ def display(self) -> None: table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)") if self.gateway: table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)") - if self.byok: - table.add_row("byok", "[bold green]True[/bold green] (remote only)") # Tool filters (only if set) if self.allowed_tools: @@ -726,7 +716,6 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: agent_params=agent_kwargs, max_steps=max_steps, group_size=cfg.group_size, - use_byok=cfg.byok, ) if not trace_ids: @@ -823,11 +812,6 @@ def eval_command( remote: bool = typer.Option( False, "--remote", help="Submit tasks to platform for remote execution" ), - byok: bool = typer.Option( - False, - "--byok", - help="Remote only: use BYOK keys from encrypted env vars for inference", - ), quiet: bool = typer.Option( False, "--quiet", "-q", help="Suppress opening browser for eval links" ), @@ -842,11 +826,11 @@ def eval_command( Examples: hud eval tasks.json claude - hud eval hud-evals/SheetBench-50 claude --full + hud eval "My Tasks" claude --full # Load from platform taskset + hud eval tasks.json claude --taskset "My Tasks" # Associate file tasks with taskset hud eval tasks.json claude --config max_tokens=32768 - hud eval tasks.json openai --config temperature=0.7 - hud eval tasks.json claude --full --remote # Remote execution - hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway + hud eval tasks.json claude --full --remote # Remote execution + hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway """ hud_console.info("🔧 Initializing evaluation...") @@ -877,7 +861,6 @@ def eval_command( group_size=group_size, config=config, remote=remote, - byok=byok, quiet=quiet, gateway=gateway, taskset=taskset, diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py index 63b355ee..51b2d7df 100644 --- a/hud/datasets/loader.py +++ b/hud/datasets/loader.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) -__all__ = ["load_dataset", "load_tasks", "save_tasks"] +__all__ = ["load_dataset", "load_tasks", "resolve_taskset_id", "save_tasks"] def _load_raw_from_file(path: Path) -> list[dict[str, Any]]: @@ -162,7 +162,9 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str, - If raw=True: list[dict] with raw task data Raises: - ValueError: If task loading fails + httpx.HTTPStatusError: If API returns an error (e.g., 404 for unknown taskset). + httpx.ConnectError: If API is unreachable. + ValueError: If file format is invalid. """ # Check if it's a local file path = Path(source) diff --git a/hud/datasets/utils.py b/hud/datasets/utils.py index f218278a..9cd64057 100644 --- a/hud/datasets/utils.py +++ b/hud/datasets/utils.py @@ -51,10 +51,6 @@ class SingleTaskRequest(BaseModel): description="Additional metadata to inject into the trace context.", ) trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.") - use_byok: bool = Field( - default=False, - description="If True, use BYOK headers from encrypted env vars for inference.", - ) @model_validator(mode="after") def _validate_task(self) -> SingleTaskRequest: @@ -125,7 +121,6 @@ async def submit_rollouts( group_size: int = 1, batch_size: int = 50, metadata: dict[str, Any] | None = None, - use_byok: bool = False, ) -> list[str]: """Submit rollouts to the HUD platform API for remote execution. @@ -140,7 +135,6 @@ async def submit_rollouts( group_size: Number of rollouts per task (for variance estimation) batch_size: Number of rollouts per API batch request metadata: Additional metadata for each rollout - use_byok: If True, use BYOK keys from encrypted env vars (remote only) """ from hud.eval.utils import is_v4_format @@ -189,7 +183,6 @@ async def submit_rollouts( trace_name=trace_name, group_id=base_task_id if group_size > 1 else None, metadata=metadata or {}, - use_byok=use_byok, ) )