Merged
2 changes: 1 addition & 1 deletion docs/platform/tasksets.mdx
@@ -116,7 +116,7 @@ Run all tasks in a taskset with one click:
Or run from the CLI:

```bash
hud eval my-taskset --model gpt-4o --group-size 10
hud eval "My Tasks" claude --full --group-size 10
```

## Task Configuration
11 changes: 7 additions & 4 deletions docs/quick-links/evals.mdx
@@ -85,14 +85,17 @@ See [Platform Models](/platform/models) for training details.
Prefer the command line? Use `hud eval` for running evaluations locally or remotely:

```bash
# Run a taskset with a model
hud eval my-taskset claude --full
# Run a platform taskset with a model
hud eval "My Tasks" claude --full

# Run with multiple repeats for variance
hud eval my-taskset claude --full --group-size 5
hud eval "My Tasks" claude --full --group-size 5

# Run remotely on HUD infrastructure
hud eval my-taskset claude --full --remote
hud eval "My Tasks" claude --full --remote

# Run from a local file, linked to a platform taskset
hud eval tasks.json claude --full --taskset "My Tasks"
```
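Running each task several times with `--group-size` lets you report a mean and spread per task. A minimal sketch of that aggregation (illustrative only — not part of the hud SDK; it assumes you have collected a list of rewards per task):

```python
from statistics import mean, stdev


def summarize_groups(
    rewards_by_task: dict[str, list[float]],
) -> dict[str, tuple[float, float]]:
    """For each task, return (mean reward, sample standard deviation)
    across its repeated runs. A single run reports a spread of 0.0."""
    summary = {}
    for task, rewards in rewards_by_task.items():
        spread = stdev(rewards) if len(rewards) > 1 else 0.0
        summary[task] = (mean(rewards), spread)
    return summary
```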

See [`hud eval` CLI reference](/reference/cli/eval) for all options.
56 changes: 42 additions & 14 deletions docs/reference/cli/eval.mdx
@@ -4,7 +4,7 @@ description: "Run agents on tasks or datasets"
icon: "robot"
---

The `hud eval` command runs an agent on a tasks file or HuggingFace dataset.
The `hud eval` command runs an agent on a tasks file or platform taskset.

<Note>
**Local Execution Dependencies**: Running Claude or Gemini agents locally requires additional packages:
@@ -23,7 +23,7 @@ hud eval [SOURCE] [AGENT] [OPTIONS]
## Arguments

<ParamField path="source" type="string">
HuggingFace dataset (e.g., `hud-evals/SheetBench-50`) or task JSON/JSONL file.
Platform taskset name (e.g., `My Tasks`) or local task JSON/JSONL file. When loading from a platform taskset, the job is automatically associated with that taskset.
</ParamField>

<ParamField path="agent" type="string">
@@ -82,6 +82,18 @@ hud eval [SOURCE] [AGENT] [OPTIONS]
Use ResponseAgent to decide when to stop/continue. Default: True for `--full`.
</ParamField>

### Taskset Association

<ParamField path="--taskset" type="string">
  Taskset name to associate the job with. Resolves the name to a taskset UUID on the platform. Useful when running from a local file while still having the job appear under a platform taskset.
</ParamField>

### LLM Routing

<ParamField path="--gateway" type="boolean" default="false">
Route LLM API calls through HUD Gateway. If you have API keys stored on the platform, they're used automatically (BYOK) at a lower credit cost. Otherwise, pooled keys are used.
</ParamField>
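The routing rule above can be pictured as a small selection function. This is purely illustrative — the real key resolution happens server-side in HUD Gateway, and the function name and key formats here are invented for clarity:

```python
def resolve_gateway_key(stored_keys: dict[str, str], provider: str) -> tuple[str, str]:
    """Return (key, mode) for a provider: prefer a key the user stored on
    the platform (BYOK, lower credit cost), else fall back to a pooled key."""
    if provider in stored_keys:
        return stored_keys[provider], "byok"
    return f"pooled-{provider}", "pooled"
```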

### Output & Confirmation

<ParamField path="--verbose, -v" type="boolean" default="false">
@@ -92,6 +104,10 @@ hud eval [SOURCE] [AGENT] [OPTIONS]
Enable debug-level logs.
</ParamField>

<ParamField path="--quiet" type="boolean" default="false">
Suppress opening the browser for eval links.
</ParamField>

<ParamField path="--yes, -y" type="boolean" default="false">
Skip confirmation prompt.
</ParamField>
@@ -107,14 +123,16 @@ On first run, a template is created:
```toml
# .hud_eval.toml
[eval]
# source = "hud-evals/SheetBench-50"
# source = "My Tasks"
# agent = "claude"
# full = false
# max_concurrent = 30
# max_steps = 10
# group_size = 1
# task_ids = ["task_1", "task_2"]
# task_ids = ["checkout-smoke", "0"] # slugs or 0-based indices
# auto_respond = true
# gateway = false
# quiet = false

[agent]
# allowed_tools = ["computer", "playwright"]
@@ -141,11 +159,17 @@ On first run, a template is created:
## Examples

```bash
# Single task (debug mode)
hud eval tasks.json claude
# Run a platform taskset (single task)
hud eval "My Tasks" claude

# Full taskset evaluation
hud eval "My Tasks" claude --full

# Full dataset evaluation
hud eval hud-evals/SheetBench-50 claude --full
# Run from a local file
hud eval tasks.json claude --full

# Local file, associated with a platform taskset
hud eval tasks.json claude --full --taskset "My Tasks"

# Run specific tasks by ID
hud eval tasks.json claude --task-ids task_1,task_5
@@ -158,13 +182,16 @@ hud eval tasks.json claude --config max_tokens=32768
hud eval tasks.json openai --config temperature=0.7

# High concurrency
hud eval hud-evals/SheetBench-50 claude --full --max-concurrent 100
hud eval "My Tasks" claude --full --max-concurrent 100

# Variance estimation (run each task 3 times)
hud eval tasks.json claude --full --group-size 3
hud eval "My Tasks" claude --full --group-size 3

# Remote execution on HUD platform
hud eval hud-evals/SheetBench-50 claude --full --remote
hud eval "My Tasks" claude --full --remote

# Route through HUD Gateway (no provider API keys needed)
hud eval tasks.json claude --full --gateway

# OpenAI-compatible endpoint (vLLM, Ollama, etc.)
hud eval tasks.json openai_compatible \
@@ -187,16 +214,17 @@ When agent is omitted, an interactive selector shows presets:
❯ Claude Sonnet 4.5
GPT-5
Operator (OpenAI Computer Use)
Gemini 2.5 Computer Use
Grok 4.1 Fast
Gemini 3 Pro Preview
Gemini CUA (Gemini Computer Use)
Grok 4-1 Fast (xAI)
```

## Remote Execution

With `--remote`, both the **agent** and **environment** run on HUD infrastructure:

```bash
hud eval hud-evals/SheetBench-50 claude --full --remote
hud eval "My Tasks" claude --full --remote
```

- **Remote agent**: Runs on HUD workers (no local compute needed)
1 change: 1 addition & 0 deletions docs/reference/evals.mdx
@@ -26,6 +26,7 @@ async with hud.eval() as ctx:
| `group` | `int` | Runs per variant for statistical significance | `1` |
| `group_ids` | `list[str] \| None` | Custom group IDs for parallel runs | `None` |
| `job_id` | `str \| None` | Job ID to link traces to | `None` |
| `taskset_id` | `str \| None` | Platform taskset UUID to associate the job with | `None` |
| `api_key` | `str \| None` | API key for backend calls | `None` |
| `max_concurrent` | `int \| None` | Maximum concurrent evaluations | `None` |
| `trace` | `bool` | Send telemetry to backend | `True` |
73 changes: 40 additions & 33 deletions hud/cli/eval.py
@@ -96,7 +96,6 @@ class AgentPreset:
# max_concurrent = 30
# max_steps = 10
# group_size = 1
# byok = false # Remote only; use encrypted env vars on the platform.
# task_ids = ["checkout-smoke", "0"] # slugs or 0-based indices
# verbose = true
# very_verbose = true
@@ -160,7 +159,6 @@ class EvalConfig(BaseModel):
"verbose",
"very_verbose",
"group_size",
"byok",
"remote",
"auto_respond",
"quiet",
@@ -182,11 +180,10 @@ class EvalConfig(BaseModel):
very_verbose: bool = False
auto_respond: bool | None = None # Continue without prompting
group_size: int = 1
byok: bool = False
remote: bool = False
quiet: bool = False # Suppress opening browser for eval links
gateway: bool = False # Use HUD Gateway for LLM API calls
taskset: str | None = None # Taskset slug to associate job with
taskset: str | None = None # Taskset name to associate job with

# Base agent config (these merge with task's agent_config)
allowed_tools: list[str] | None = None
@@ -214,11 +211,6 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:

def validate_api_keys(self) -> None:
"""Validate required API keys for the selected agent. Raises typer.Exit on failure."""
# BYOK requires remote execution (check before agent_type guard)
if self.byok and not self.remote:
hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
raise typer.Exit(1)

if self.agent_type is None:
return

@@ -547,8 +539,6 @@ def display(self) -> None:
table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
if self.gateway:
table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
if self.byok:
table.add_row("byok", "[bold green]True[/bold green] (remote only)")

# Tool filters (only if set)
if self.allowed_tools:
@@ -609,19 +599,42 @@ def display(self) -> None:

async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
"""Run evaluation with the given config using run_dataset()."""
from hud.datasets import load_tasks, run_dataset
from pathlib import Path

from hud.datasets import run_dataset
from hud.datasets.loader import _load_from_api, _load_from_file

if cfg.source is None or cfg.agent_type is None:
raise ValueError("source and agent_type must be set")

# Load tasks using unified loader (handles v4→v5 conversion automatically)
# Load tasks — use internal loaders to capture taskset_id from API sources
hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
tasks = load_tasks(cfg.source)
path = Path(cfg.source)
taskset_id: str | None = None
try:
if path.exists() and path.suffix in {".json", ".jsonl"}:
tasks = _load_from_file(path)
else:
tasks, taskset_id = _load_from_api(cfg.source)
except Exception as e:
hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
raise typer.Exit(1) from e
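The dispatch above boils down to a simple rule. A standalone sketch (a hypothetical helper, not part of the codebase) that classifies a source string the same way:

```python
from pathlib import Path


def classify_source(source: str) -> str:
    """Existing .json/.jsonl paths load locally; anything else is
    treated as a platform taskset name and fetched from the API."""
    path = Path(source)
    if path.exists() and path.suffix in {".json", ".jsonl"}:
        return "file"
    return "api"
```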

if not tasks:
hud_console.error(f"No tasks found in: {cfg.source}")
raise typer.Exit(1)

# TODO: --taskset with file source should sync local tasks to the platform taskset
# (diff, save, then run). For now it just resolves the slug and associates the job.
if cfg.taskset:
from hud.datasets.loader import resolve_taskset_id

try:
taskset_id = resolve_taskset_id(cfg.taskset)
except Exception as e:
hud_console.error(f"Failed to resolve taskset '{cfg.taskset}': {e}")
raise typer.Exit(1) from e

# Filter by task slugs (or positional indices) if provided
if cfg.task_ids:
selector_set = set(cfg.task_ids)
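The selector semantics — each entry in `--task-ids` may be a task slug or a 0-based positional index — can be sketched as a standalone filter (hypothetical helper; the real implementation may handle more cases):

```python
def select_tasks(tasks: list[dict], task_ids: list[str]) -> list[dict]:
    """Keep tasks whose slug matches a selector, or whose 0-based
    position matches a selector written as a numeric string."""
    selector_set = set(task_ids)
    return [
        task
        for index, task in enumerate(tasks)
        if task.get("id") in selector_set or str(index) in selector_set
    ]
```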
@@ -650,15 +663,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:

max_steps = cfg.max_steps

import uuid

from hud.eval.manager import _get_eval_name, _send_job_enter

# Remote execution - submit to HUD platform
if cfg.remote:
agent_kwargs = {
k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
}
import uuid

from hud.datasets.utils import submit_rollouts
from hud.eval.manager import _send_job_enter

job_id = str(uuid.uuid4())
hud_console.info(
@@ -687,11 +701,11 @@

await _send_job_enter(
job_id=job_id,
name=f"eval ({cfg.source})" if cfg.source else "eval",
name=_get_eval_name(tasks=tasks, group=cfg.group_size),
variants=None,
group=cfg.group_size,
api_key=None,
taskset=cfg.taskset,
taskset_id=taskset_id,
hud_eval_config=eval_cfg_dict,
)

Expand All @@ -702,7 +716,6 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
agent_params=agent_kwargs,
max_steps=max_steps,
group_size=cfg.group_size,
use_byok=cfg.byok,
)

if not trace_ids:
@@ -734,7 +747,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
max_concurrent=cfg.max_concurrent,
group_size=cfg.group_size,
quiet=cfg.quiet,
taskset=cfg.taskset,
taskset_id=taskset_id,
)

# Show reward for single task
@@ -750,7 +763,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:


def eval_command(
source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
    source: str | None = typer.Argument(None, help="Taskset name or task JSON/JSONL file"),
agent: str | None = typer.Argument(
None,
help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test", # noqa: E501
@@ -799,30 +812,25 @@ def eval_command(
remote: bool = typer.Option(
False, "--remote", help="Submit tasks to platform for remote execution"
),
byok: bool = typer.Option(
False,
"--byok",
help="Remote only: use BYOK keys from encrypted env vars for inference",
),
quiet: bool = typer.Option(
False, "--quiet", "-q", help="Suppress opening browser for eval links"
),
gateway: bool = typer.Option(
False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
),
taskset: str | None = typer.Option(
None, "--taskset", "-t", help="Taskset slug to associate job with"
None, "--taskset", "-t", help="Taskset name to associate job with"
),
) -> None:
    """🚀 Run evaluation on tasksets or individual tasks with agents.

Examples:
hud eval tasks.json claude
hud eval hud-evals/SheetBench-50 claude --full
hud eval "My Tasks" claude --full # Load from platform taskset
hud eval tasks.json claude --taskset "My Tasks" # Associate file tasks with taskset
hud eval tasks.json claude --config max_tokens=32768
hud eval tasks.json openai --config temperature=0.7
hud eval tasks.json claude --full --remote # Remote execution
hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway
hud eval tasks.json claude --full --remote # Remote execution
hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway
"""
hud_console.info("🔧 Initializing evaluation...")

@@ -853,7 +861,6 @@ def eval_command(
group_size=group_size,
config=config,
remote=remote,
byok=byok,
quiet=quiet,
gateway=gateway,
taskset=taskset,
4 changes: 2 additions & 2 deletions hud/cli/tests/test_build.py
@@ -60,12 +60,12 @@ def test_increment_patch(self):
def test_increment_minor(self):
"""Test incrementing minor version."""
assert increment_version("1.2.3", "minor") == "1.3.0"
assert increment_version("0.5.28", "minor") == "0.6.0"
assert increment_version("0.5.29", "minor") == "0.6.0"

def test_increment_major(self):
"""Test incrementing major version."""
assert increment_version("1.2.3", "major") == "2.0.0"
assert increment_version("0.5.28", "major") == "1.0.0"
assert increment_version("0.5.29", "major") == "1.0.0"

def test_increment_with_v_prefix(self):
"""Test incrementing version with v prefix."""
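The semver bumping these tests exercise can be sketched as follows (a hypothetical reimplementation consistent with the assertions above; the real `increment_version` in `hud/cli` may differ in edge-case handling):

```python
def increment_version(version: str, part: str = "patch") -> str:
    """Bump a semver string; a leading "v" prefix is preserved.
    Bumping a part resets every lower-order part to zero."""
    prefix = "v" if version.startswith("v") else ""
    major, minor, patch = (int(x) for x in version.removeprefix("v").split("."))
    if part == "major":
        major, minor, patch = major + 1, 0, 0
    elif part == "minor":
        minor, patch = minor + 1, 0
    else:
        patch += 1
    return f"{prefix}{major}.{minor}.{patch}"
```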
2 changes: 1 addition & 1 deletion hud/datasets/__init__.py
@@ -3,7 +3,7 @@
Provides unified task loading, saving, and execution for HUD evaluations.

Key functions:
- load_tasks(): Load tasks from JSON, JSONL, HuggingFace, or HUD API
- load_tasks(): Load tasks from JSON, JSONL, or HUD API
- save_tasks(): Save tasks to the HUD API
- run_dataset(): Run an agent on a dataset of tasks
- submit_rollouts(): Submit tasks for remote execution