From e7d3d517578df09cd972ba147e2d7d04709bef1e Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 11:15:24 -0800
Subject: [PATCH 01/14] fix job linking: default taskset to source for API-loaded tasks

---
 hud/cli/eval.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index d705f01d..1c251858 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -622,6 +622,10 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         hud_console.error(f"No tasks found in: {cfg.source}")
         raise typer.Exit(1)
 
+    # Default taskset to source when loading from API (not a local file)
+    if cfg.taskset is None and not Path(cfg.source).exists():
+        cfg.taskset = cfg.source
+
     # Filter by task slugs (or positional indices) if provided
     if cfg.task_ids:
         selector_set = set(cfg.task_ids)

From c596dc135e60be6dc74bb4b4122de2acc9383445 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 11:36:07 -0800
Subject: [PATCH 02/14] use taskset_id for API-loaded tasks

---
 hud/cli/eval.py        |  9 ++++-----
 hud/datasets/loader.py | 26 ++++++++++++++++++--------
 hud/datasets/runner.py |  2 ++
 hud/eval/manager.py    | 14 +++++++-------
 hud/eval/types.py      |  2 +-
 5 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 1c251858..5ae8805a 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -622,9 +622,8 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         hud_console.error(f"No tasks found in: {cfg.source}")
         raise typer.Exit(1)
 
-    # Default taskset to source when loading from API (not a local file)
-    if cfg.taskset is None and not Path(cfg.source).exists():
-        cfg.taskset = cfg.source
+    # Extract taskset_id from API-loaded tasks (set by loader in metadata)
+    taskset_id: str | None = tasks[0].metadata.get("taskset_id") if tasks else None
 
     # Filter by task slugs (or positional indices) if provided
     if cfg.task_ids:
@@ -695,7 +694,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             variants=None,
             group=cfg.group_size,
             api_key=None,
-            taskset=cfg.taskset,
+            taskset_id=taskset_id,
             hud_eval_config=eval_cfg_dict,
         )
 
@@ -738,7 +737,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         max_concurrent=cfg.max_concurrent,
         group_size=cfg.group_size,
         quiet=cfg.quiet,
-        taskset=cfg.taskset,
+        taskset_id=taskset_id,
     )
 
     # Show reward for single task
diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py
index 848e868c..48b6d8b0 100644
--- a/hud/datasets/loader.py
+++ b/hud/datasets/loader.py
@@ -108,8 +108,11 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
     return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
 
 
-def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
-    """Load raw task dicts from HUD API."""
+def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | None]:
+    """Load raw task dicts from HUD API.
+
+    Returns (tasks, taskset_id) tuple.
+    """
     from hud.datasets.utils import _normalize_task_dict
 
     headers = {}
@@ -125,23 +128,27 @@ def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
         response.raise_for_status()
         data = response.json()
 
-        # Extract tasks dict from response
+        taskset_id = data.get("evalset_id")
         tasks_dict = data.get("tasks", {})
 
-        return [
+        tasks = [
             _normalize_task_dict(task_data)
             for task_data in tasks_dict.values()
             if isinstance(task_data, dict)
         ]
+        return tasks, taskset_id
 
 
 def _load_from_api(dataset_name: str) -> list[Task]:
     """Load tasks from HUD API."""
     from hud.eval.task import Task
 
-    raw_items = _load_raw_from_api(dataset_name)
-    # Default args to {} for runnable tasks (None = template)
-    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
+    raw_items, taskset_id = _load_raw_from_api(dataset_name)
+    tasks = [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
+    if taskset_id:
+        for task in tasks:
+            task.metadata["taskset_id"] = taskset_id
+    return tasks
 
 
 @overload
@@ -210,7 +217,10 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
     # Try HUD API first
     try:
         logger.info("Trying HUD API: %s", source)
-        items = _load_raw_from_api(source) if raw else _load_from_api(source)
+        if raw:
+            items, _ = _load_raw_from_api(source)
+        else:
+            items = _load_from_api(source)
         logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
         return items
     except Exception as hud_error:
diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py
index d4d9a586..a34c0a76 100644
--- a/hud/datasets/runner.py
+++ b/hud/datasets/runner.py
@@ -30,6 +30,7 @@ async def run_dataset(
     group_size: int = 1,
     quiet: bool = True,
     taskset: str | None = None,
+    taskset_id: str | None = None,
 ) -> list[EvalContext]:
     """Run an agent on a dataset of tasks.
 
@@ -98,6 +99,7 @@ async def run_dataset(
         max_concurrent=max_concurrent,
         quiet=quiet,
         taskset=taskset,
+        taskset_id=taskset_id,
     ) as ctx:
         # Build agent params - use system_prompt from ctx (set from task.agent_config)
         final_agent_params = dict(agent_params or {})
diff --git a/hud/eval/manager.py b/hud/eval/manager.py
index 19481232..4b3ffcd9 100644
--- a/hud/eval/manager.py
+++ b/hud/eval/manager.py
@@ -71,13 +71,12 @@ async def _send_job_enter(
     variants: dict[str, Any] | None,
     group: int,
     api_key: str | None,
-    taskset: str | None = None,
+    taskset_id: str | None = None,
     hud_eval_config: dict[str, Any] | None = None,
 ) -> None:
     """Send job enter payload (async request before traces start).
 
-    Registers the job with the platform. Tasks must already exist in the
-    taskset.
+    Registers the job with the platform.
     """
     import httpx
 
@@ -92,7 +91,7 @@ async def _send_job_enter(
         name=name,
         variants=variants,
         group=group,
-        taskset=taskset,
+        taskset_id=taskset_id,
         hud_eval_config=hud_eval_config,
     )
 
@@ -122,6 +121,7 @@ async def run_eval(
     trace: bool = True,
     quiet: bool = False,
     taskset: str | None = None,
+    taskset_id: str | None = None,
 ) -> AsyncGenerator[EvalContext, None]:
     """Standalone eval context manager.
 
@@ -253,7 +253,7 @@ async def run_eval(
     if total_evals == 1:
         if tasks:
             job_id_for_run = job_id
-            if taskset:
+            if taskset or taskset_id:
                 eval_name = _get_eval_name(tasks=tasks, group=group)
                 if job_id_for_run is None:
                     job_id_for_run = str(uuid.uuid4())
@@ -264,7 +264,7 @@ async def run_eval(
                     variants=variants,
                     group=group,
                     api_key=api_key,
-                    taskset=taskset,
+                    taskset_id=taskset_id,
                 )
 
             # Single task - use EvalContext.from_task()
@@ -311,7 +311,7 @@ async def run_eval(
             variants=variants,
             group=group,
             api_key=api_key,
-            taskset=taskset,
+            taskset_id=taskset_id,
         )
 
         # Print job URL (not individual trace URLs)
diff --git a/hud/eval/types.py b/hud/eval/types.py
index ecc307ec..1d43926e 100644
--- a/hud/eval/types.py
+++ b/hud/eval/types.py
@@ -54,7 +54,7 @@ class JobEnterPayload(BaseModel):
     name: str | None = None
     variants: dict[str, Any] | None = None  # Full variant config
     group: int | None = None
-    taskset: str | None = None  # taskset slug to associate job with
+    taskset_id: str | None = None  # evalset UUID to associate job with
     hud_eval_config: dict[str, Any] | None = None  # replayable hud eval config (no secrets)
 
 

From 14855219cfeba3c0a4b3854a91b9eb5fb197ba93 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 11:36:12 -0800
Subject: [PATCH 03/14] remove HuggingFace dataset loading and `datasets` dependency

---
 hud/cli/eval.py          |  2 +-
 hud/datasets/__init__.py |  2 +-
 hud/datasets/loader.py   | 95 ++++------------------------------------
 pyproject.toml           |  2 -
 4 files changed, 10 insertions(+), 91 deletions(-)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 5ae8805a..459eeabc 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -753,7 +753,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
 
 
 def eval_command(
-    source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
+    source: str | None = typer.Argument(None, help="Taskset slug or task JSON file"),
     agent: str | None = typer.Argument(
         None,
         help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
diff --git a/hud/datasets/__init__.py b/hud/datasets/__init__.py
index 6bf88851..8d4cebfc 100644
--- a/hud/datasets/__init__.py
+++ b/hud/datasets/__init__.py
@@ -3,7 +3,7 @@
 Provides unified task loading, saving, and execution for HUD evaluations.
 
 Key functions:
-- load_tasks(): Load tasks from JSON, JSONL, HuggingFace, or HUD API
+- load_tasks(): Load tasks from JSON, JSONL, or HUD API
 - save_tasks(): Save tasks to the HUD API
 - run_dataset(): Run an agent on a dataset of tasks
 - submit_rollouts(): Submit tasks for remote execution
diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py
index 48b6d8b0..e7be7bbb 100644
--- a/hud/datasets/loader.py
+++ b/hud/datasets/loader.py
@@ -3,7 +3,6 @@
 Unified interface for loading evaluation tasks from:
 - HUD API (v5 format)
 - Local JSON/JSONL files (v4 LegacyTask format, auto-converted)
-- HuggingFace datasets (v4 LegacyTask format, auto-converted)
 """
 
 from __future__ import annotations
@@ -71,43 +70,6 @@ def _load_from_file(path: Path) -> list[Task]:
     return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
 
 
-def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
-    """Load raw task dicts from HuggingFace dataset."""
-    try:
-        from datasets import load_dataset as hf_load_dataset
-    except ImportError as e:
-        raise ImportError(
-            "Please install 'datasets' to load from HuggingFace: uv pip install datasets"
-        ) from e
-
-    # Parse dataset name and optional split
-    if ":" in dataset_name:
-        name, split = dataset_name.split(":", 1)
-    else:
-        name = dataset_name
-        split = "train"  # Default split
-
-    logger.info("Loading from HuggingFace dataset: %s (split=%s)", name, split)
-    dataset = hf_load_dataset(name, split=split)
-
-    raw_items: list[dict[str, Any]] = []
-    for item in dataset:
-        if not isinstance(item, dict):
-            raise ValueError(f"Invalid HuggingFace dataset: expected dict, got {type(item)}")
-        raw_items.append(dict(item))
-
-    return raw_items
-
-
-def _load_from_huggingface(dataset_name: str) -> list[Task]:
-    """Load tasks from HuggingFace dataset."""
-    raw_items = _load_raw_from_huggingface(dataset_name)
-    from hud.eval.task import Task
-
-    # Default args to {} for runnable tasks (None = template)
-    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
-
-
 def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | None]:
     """Load raw task dicts from HUD API.
 
@@ -165,7 +127,6 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
     Supports multiple sources with auto-detection:
     - Local file path (JSON or JSONL)
     - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
-    - HuggingFace dataset (e.g., "username/dataset" or "username/dataset:split")
 
     Automatically detects and converts v4 LegacyTask format to v5 Task.
 
@@ -173,7 +134,6 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
         source: Task source. Can be:
             - Path to a local JSON/JSONL file
             - HUD API dataset slug (e.g., "hud-evals/SheetBench-50")
-            - HuggingFace dataset name (e.g., "hud-evals/tasks" or "hud-evals/tasks:train")
         raw: If True, return raw dicts without validation or env var substitution.
             Useful for preserving template strings like "${HUD_API_KEY}".
 
@@ -181,28 +141,6 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
         - If raw=False (default): list[Task] ready to use with hud.eval()
         - If raw=True: list[dict] with raw task data
 
-    Example:
-        ```python
-        import hud
-        from hud.datasets import load_tasks
-
-        # Load from HUD API
-        tasks = load_tasks("hud-evals/SheetBench-50")
-
-        # Load from local file (v4 format auto-converted)
-        tasks = load_tasks("./my-tasks.json")
-
-        # Load from HuggingFace
-        tasks = load_tasks("hud-evals/benchmark:test")
-
-        # Load raw dicts (preserves env var placeholders)
-        raw_tasks = load_tasks("./tasks.json", raw=True)
-
-        # Run evaluation
-        async with hud.eval(tasks) as ctx:
-            await agent.run(ctx)
-        ```
-
     Raises:
         ValueError: If task loading fails
     """
@@ -214,31 +152,14 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
         logger.info("Loaded %d tasks from %s", len(items), source)
         return items
 
-    # Try HUD API first
-    try:
-        logger.info("Trying HUD API: %s", source)
-        if raw:
-            items, _ = _load_raw_from_api(source)
-        else:
-            items = _load_from_api(source)
-        logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
-        return items
-    except Exception as hud_error:
-        logger.debug("HUD API load failed (%s), trying HuggingFace", hud_error)
-
-    # Try HuggingFace as fallback
-    try:
-        logger.info("Trying HuggingFace dataset: %s", source)
-        items = _load_raw_from_huggingface(source) if raw else _load_from_huggingface(source)
-        logger.info("Loaded %d tasks from HuggingFace: %s", len(items), source)
-        return items
-    except ImportError:
-        raise ValueError(
-            f"Failed to load tasks from '{source}'. "
-            "Install 'datasets' package for HuggingFace support."
-        ) from None
-    except Exception as hf_error:
-        raise ValueError(f"Failed to load tasks from '{source}': {hf_error}") from hf_error
+    # Try HUD API
+    logger.info("Trying HUD API: %s", source)
+    if raw:
+        items, _ = _load_raw_from_api(source)
+    else:
+        items = _load_from_api(source)
+    logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
+    return items
 
 
 def save_tasks(
diff --git a/pyproject.toml b/pyproject.toml
index b38d291f..910ae2ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -113,8 +113,6 @@ agents = [
     "anthropic>=0.78.0",
     "google-genai",
     "openai-agents",
-    # Dataset loading (HuggingFace)
-    "datasets>=2.14.0",
     # Image processing for screenshots/grounding
     "pillow>=11.1.0",
     # Jupyter kernel support

From b853aed8b1f8e7762b446a7d8b308e5d7c5bb286 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 12:10:11 -0800
Subject: [PATCH 04/14] simplify: return taskset_id from loader and pass pre-registered job_id

---
 hud/cli/eval.py        | 39 +++++++++++++++++++++++++++++----------
 hud/datasets/loader.py | 14 +++++++-------
 hud/datasets/runner.py |  9 +++++----
 hud/eval/manager.py    | 39 +++++++++++----------------------------
 4 files changed, 52 insertions(+), 49 deletions(-)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 459eeabc..0d3ee4ee 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -609,22 +609,27 @@ def display(self) -> None:
 
 async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
     """Run evaluation with the given config using run_dataset()."""
-    from hud.datasets import load_tasks, run_dataset
+    from pathlib import Path
+
+    from hud.datasets import run_dataset
+    from hud.datasets.loader import _load_from_api, _load_from_file
 
     if cfg.source is None or cfg.agent_type is None:
         raise ValueError("source and agent_type must be set")
 
-    # Load tasks using unified loader (handles v4→v5 conversion automatically)
+    # Load tasks — use internal loaders to capture taskset_id from API sources
     hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
-    tasks = load_tasks(cfg.source)
+    path = Path(cfg.source)
+    taskset_id: str | None = None
+    if path.exists() and path.suffix in {".json", ".jsonl"}:
+        tasks = _load_from_file(path)
+    else:
+        tasks, taskset_id = _load_from_api(cfg.source)
 
     if not tasks:
         hud_console.error(f"No tasks found in: {cfg.source}")
         raise typer.Exit(1)
 
-    # Extract taskset_id from API-loaded tasks (set by loader in metadata)
-    taskset_id: str | None = tasks[0].metadata.get("taskset_id") if tasks else None
-
     # Filter by task slugs (or positional indices) if provided
     if cfg.task_ids:
         selector_set = set(cfg.task_ids)
@@ -653,15 +658,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
 
     max_steps = cfg.max_steps
 
+    import uuid
+
+    from hud.eval.manager import _send_job_enter
+
     # Remote execution - submit to HUD platform
     if cfg.remote:
         agent_kwargs = {
             k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
         }
-        import uuid
-
         from hud.datasets.utils import submit_rollouts
-        from hud.eval.manager import _send_job_enter
 
         job_id = str(uuid.uuid4())
         hud_console.info(
@@ -728,6 +734,19 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             f"group_size: {cfg.group_size})…"
         )
 
+    # Register job with taskset association if tasks came from API
+    job_id: str | None = None
+    if taskset_id:
+        job_id = str(uuid.uuid4())
+        await _send_job_enter(
+            job_id=job_id,
+            name=f"eval ({cfg.source})" if cfg.source else "eval",
+            variants=None,
+            group=cfg.group_size,
+            api_key=None,
+            taskset_id=taskset_id,
+        )
+
     # Run using run_dataset
     results = await run_dataset(
         tasks,
@@ -737,7 +756,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         max_concurrent=cfg.max_concurrent,
         group_size=cfg.group_size,
         quiet=cfg.quiet,
-        taskset_id=taskset_id,
+        job_id=job_id,
     )
 
     # Show reward for single task
diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py
index e7be7bbb..ebc0adde 100644
--- a/hud/datasets/loader.py
+++ b/hud/datasets/loader.py
@@ -101,16 +101,16 @@ def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | N
         return tasks, taskset_id
 
 
-def _load_from_api(dataset_name: str) -> list[Task]:
-    """Load tasks from HUD API."""
+def _load_from_api(dataset_name: str) -> tuple[list[Task], str | None]:
+    """Load tasks from HUD API.
+
+    Returns (tasks, taskset_id) tuple.
+    """
     from hud.eval.task import Task
 
     raw_items, taskset_id = _load_raw_from_api(dataset_name)
     tasks = [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
-    if taskset_id:
-        for task in tasks:
-            task.metadata["taskset_id"] = taskset_id
-    return tasks
+    return tasks, taskset_id
 
 
 @overload
@@ -157,7 +157,7 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
     if raw:
         items, _ = _load_raw_from_api(source)
     else:
-        items = _load_from_api(source)
+        items, _ = _load_from_api(source)
     logger.info("Loaded %d tasks from HUD API: %s", len(items), source)
     return items
 
diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py
index a34c0a76..d0244636 100644
--- a/hud/datasets/runner.py
+++ b/hud/datasets/runner.py
@@ -29,8 +29,7 @@ async def run_dataset(
     max_concurrent: int = 30,
     group_size: int = 1,
     quiet: bool = True,
-    taskset: str | None = None,
-    taskset_id: str | None = None,
+    job_id: str | None = None,
 ) -> list[EvalContext]:
     """Run an agent on a dataset of tasks.
 
@@ -48,6 +47,9 @@ async def run_dataset(
         max_concurrent: Maximum concurrent tasks (for parallel execution).
         group_size: Number of times to run each task (for variance estimation).
         quiet: Whether to suppress printing eval links and opening browser (default True).
+        job_id: Pre-registered job ID. If provided, traces are grouped under this job
+            and no implicit job is created. If None, a job is created automatically
+            for parallel execution.
 
     Returns:
         List of EvalContext results from each task execution. Access `.reward` on each.
@@ -98,8 +100,7 @@ async def run_dataset(
         group=group_size,
         max_concurrent=max_concurrent,
         quiet=quiet,
-        taskset=taskset,
-        taskset_id=taskset_id,
+        job_id=job_id,
     ) as ctx:
         # Build agent params - use system_prompt from ctx (set from task.agent_config)
         final_agent_params = dict(agent_params or {})
diff --git a/hud/eval/manager.py b/hud/eval/manager.py
index 4b3ffcd9..0908738e 100644
--- a/hud/eval/manager.py
+++ b/hud/eval/manager.py
@@ -120,8 +120,6 @@ async def run_eval(
     max_concurrent: int | None = None,
     trace: bool = True,
     quiet: bool = False,
-    taskset: str | None = None,
-    taskset_id: str | None = None,
 ) -> AsyncGenerator[EvalContext, None]:
     """Standalone eval context manager.
 
@@ -139,7 +137,7 @@ async def run_eval(
         variants: A/B test configuration (dict with list values expanded)
         group: Runs per variant for statistical significance
         group_ids: Optional list of group IDs
-        job_id: Job ID to link to
+        job_id: Pre-registered job ID. Skips implicit job creation if provided.
         group_id: Group ID for parallel evaluations
         trace_id: Pre-assigned trace ID (auto-generated if not provided)
         api_key: API key for backend calls
@@ -252,28 +250,13 @@ async def run_eval(
 
     if total_evals == 1:
         if tasks:
-            job_id_for_run = job_id
-            if taskset or taskset_id:
-                eval_name = _get_eval_name(tasks=tasks, group=group)
-                if job_id_for_run is None:
-                    job_id_for_run = str(uuid.uuid4())
-
-                await _send_job_enter(
-                    job_id=job_id_for_run,
-                    name=eval_name,
-                    variants=variants,
-                    group=group,
-                    api_key=api_key,
-                    taskset_id=taskset_id,
-                )
-
             # Single task - use EvalContext.from_task()
             ctx = EvalContext.from_task(
                 tasks[0],
                 name=name,
                 trace_id=trace_id,
                 api_key=api_key,
-                job_id=job_id_for_run,
+                job_id=job_id,
                 group_id=group_id,
                 variants=variant_combos[0],
                 code_snippet=code_snippet,
@@ -304,15 +287,15 @@ async def run_eval(
         implicit_job_id = job_id or str(uuid.uuid4())
         job_url = f"https://hud.ai/jobs/{implicit_job_id}"
 
-        # Send job enter (sync request before traces start)
-        await _send_job_enter(
-            job_id=implicit_job_id,
-            name=eval_name,
-            variants=variants,
-            group=group,
-            api_key=api_key,
-            taskset_id=taskset_id,
-        )
+        # Register job if not already provided by caller
+        if not job_id:
+            await _send_job_enter(
+                job_id=implicit_job_id,
+                name=eval_name,
+                variants=variants,
+                group=group,
+                api_key=api_key,
+            )
 
         # Print job URL (not individual trace URLs)
         if not quiet:

From 97426adf7a1efbb368c234567920260fdbee7735 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 17:11:43 -0800
Subject: [PATCH 05/14] align eval naming

---
 hud/cli/eval.py        | 19 +++----------------
 hud/datasets/runner.py |  3 +++
 hud/eval/manager.py    |  3 +++
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 0d3ee4ee..bb386245 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -660,7 +660,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
 
     import uuid
 
-    from hud.eval.manager import _send_job_enter
+    from hud.eval.manager import _get_eval_name, _send_job_enter
 
     # Remote execution - submit to HUD platform
     if cfg.remote:
@@ -696,7 +696,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
 
         await _send_job_enter(
             job_id=job_id,
-            name=f"eval ({cfg.source})" if cfg.source else "eval",
+            name=_get_eval_name(tasks=tasks, group=cfg.group_size),
             variants=None,
             group=cfg.group_size,
             api_key=None,
@@ -734,19 +734,6 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             f"group_size: {cfg.group_size})…"
         )
 
-    # Register job with taskset association if tasks came from API
-    job_id: str | None = None
-    if taskset_id:
-        job_id = str(uuid.uuid4())
-        await _send_job_enter(
-            job_id=job_id,
-            name=f"eval ({cfg.source})" if cfg.source else "eval",
-            variants=None,
-            group=cfg.group_size,
-            api_key=None,
-            taskset_id=taskset_id,
-        )
-
     # Run using run_dataset
     results = await run_dataset(
         tasks,
@@ -756,7 +743,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         max_concurrent=cfg.max_concurrent,
         group_size=cfg.group_size,
         quiet=cfg.quiet,
-        job_id=job_id,
+        taskset_id=taskset_id,
     )
 
     # Show reward for single task
diff --git a/hud/datasets/runner.py b/hud/datasets/runner.py
index d0244636..f6d21b60 100644
--- a/hud/datasets/runner.py
+++ b/hud/datasets/runner.py
@@ -30,6 +30,7 @@ async def run_dataset(
     group_size: int = 1,
     quiet: bool = True,
     job_id: str | None = None,
+    taskset_id: str | None = None,
 ) -> list[EvalContext]:
     """Run an agent on a dataset of tasks.
 
@@ -50,6 +51,7 @@ async def run_dataset(
         job_id: Pre-registered job ID. If provided, traces are grouped under this job
             and no implicit job is created. If None, a job is created automatically
             for parallel execution.
+        taskset_id: Taskset UUID to associate the job with on the platform.
 
     Returns:
         List of EvalContext results from each task execution. Access `.reward` on each.
@@ -101,6 +103,7 @@ async def run_dataset(
         max_concurrent=max_concurrent,
         quiet=quiet,
         job_id=job_id,
+        taskset_id=taskset_id,
     ) as ctx:
         # Build agent params - use system_prompt from ctx (set from task.agent_config)
         final_agent_params = dict(agent_params or {})
diff --git a/hud/eval/manager.py b/hud/eval/manager.py
index 0908738e..5e535520 100644
--- a/hud/eval/manager.py
+++ b/hud/eval/manager.py
@@ -118,6 +118,7 @@ async def run_eval(
     trace_id: str | None = None,
     api_key: str | None = None,
     max_concurrent: int | None = None,
+    taskset_id: str | None = None,
     trace: bool = True,
     quiet: bool = False,
 ) -> AsyncGenerator[EvalContext, None]:
@@ -142,6 +143,7 @@ async def run_eval(
         trace_id: Pre-assigned trace ID (auto-generated if not provided)
         api_key: API key for backend calls
         max_concurrent: Maximum concurrent evals (None = unlimited)
+        taskset_id: Taskset UUID to associate the job with on the platform.
         trace: Whether to send trace data to backend (default True)
         quiet: Whether to suppress printing links (default False)
 
@@ -295,6 +297,7 @@ async def run_eval(
                 variants=variants,
                 group=group,
                 api_key=api_key,
+                taskset_id=taskset_id,
             )
 
         # Print job URL (not individual trace URLs)

From 3d9daeec72cb1a6bfe00c57e49ca6be224a963f2 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 17:22:01 -0800
Subject: [PATCH 06/14] align eval names

---
 hud/eval/manager.py | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/hud/eval/manager.py b/hud/eval/manager.py
index 5e535520..f2d793bc 100644
--- a/hud/eval/manager.py
+++ b/hud/eval/manager.py
@@ -31,38 +31,26 @@
 
 
 def _get_eval_name(tasks: list[Task] | None = None, group: int = 1) -> str:
-    """Extract a nice name for job display.
+    """Build a job display name.
 
-    Args:
-        tasks: List of Task objects
-        group: Group size (runs per task)
-
-    Returns:
-        Name like "scenario (group=5)" for single task or "eval (50 tasks)" for batch
+    Convention:
+        1 task, group=1:  "Task Run: {scenario}"
+        1 task, group>1:  "Task Run: {scenario} (4 times)"
+        N tasks, group=1: "Batch Run: N tasks"
+        N tasks, group>1: "Batch Run: N tasks (4 times)"
     """
+    suffix = f" ({group} times)" if group > 1 else ""
+
     if not tasks:
-        return "eval"
+        return f"Task Run: eval{suffix}"
 
-    # Single task: use scenario/env name
     if len(tasks) == 1:
-        name = None
-        if tasks[0].scenario:
-            name = tasks[0].scenario
-        elif tasks[0].env and hasattr(tasks[0].env, "name"):
+        name = tasks[0].scenario
+        if not name and tasks[0].env and hasattr(tasks[0].env, "name"):
             name = tasks[0].env.name
+        return f"Task Run: {name or 'eval'}{suffix}"
 
-        if name:
-            if group > 1:
-                return f"{name} (group={group})"
-            return name
-        return "eval"
-
-    # Batch: use generic name with count
-    parts = [f"{len(tasks)} tasks"]
-    if group > 1:
-        parts.append(f"group={group}")
-
-    return f"eval ({', '.join(parts)})"
+    return f"Batch Run: {len(tasks)} tasks{suffix}"
 
 
 async def _send_job_enter(

From 7054a26444f30e476a74f886e7d5c93c09e119c7 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 18:48:05 -0800
Subject: [PATCH 07/14] update tests

---
 hud/datasets/tests/test_loader.py   | 32 ++++++++++++++++-------------
 hud/tests/test_datasets_extended.py |  3 ++-
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/hud/datasets/tests/test_loader.py b/hud/datasets/tests/test_loader.py
index 384e3c6e..5c8a92ea 100644
--- a/hud/datasets/tests/test_loader.py
+++ b/hud/datasets/tests/test_loader.py
@@ -137,45 +137,49 @@ def test_load_tasks_no_api_key(
 
     @patch("hud.datasets.loader.httpx.Client")
     @patch("hud.datasets.loader.settings")
-    def test_load_tasks_http_error(
+    def test_load_tasks_taskset_not_found(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
-        """load_tasks() raises ValueError on HTTP error."""
+        """load_tasks() raises HTTPStatusError when taskset doesn't exist."""
         import httpx
 
         mock_settings.hud_api_url = "https://api.hud.ai"
         mock_settings.api_key = "test_key"
 
+        mock_response = MagicMock()
+        mock_response.status_code = 404
+        mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
+            "Not Found", request=MagicMock(), response=mock_response
+        )
+
         mock_client = MagicMock()
-        mock_client.get.side_effect = httpx.HTTPError("Network error")
+        mock_client.get.return_value = mock_response
         mock_client.__enter__.return_value = mock_client
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
 
-        with pytest.raises(ValueError, match="Failed to load tasks"):
-            load_tasks("test-org/test-dataset")
+        with pytest.raises(httpx.HTTPStatusError):
+            load_tasks("nonexistent-taskset")
 
     @patch("hud.datasets.loader.httpx.Client")
     @patch("hud.datasets.loader.settings")
-    def test_load_tasks_json_error(
+    def test_load_tasks_network_error(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
-        """load_tasks() raises ValueError on JSON processing error."""
+        """load_tasks() raises ConnectError when API is unreachable."""
+        import httpx
+
         mock_settings.hud_api_url = "https://api.hud.ai"
         mock_settings.api_key = "test_key"
 
-        mock_response = MagicMock()
-        mock_response.json.side_effect = Exception("Invalid JSON")
-        mock_response.raise_for_status = MagicMock()
-
         mock_client = MagicMock()
-        mock_client.get.return_value = mock_response
+        mock_client.get.side_effect = httpx.ConnectError("Connection refused")
         mock_client.__enter__.return_value = mock_client
         mock_client.__exit__.return_value = None
         mock_client_class.return_value = mock_client
 
-        with pytest.raises(ValueError, match="Failed to load tasks"):
-            load_tasks("test-org/test-dataset")
+        with pytest.raises(httpx.ConnectError):
+            load_tasks("my-taskset")
 
     @patch("hud.datasets.loader.httpx.Client")
     @patch("hud.datasets.loader.settings")
diff --git a/hud/tests/test_datasets_extended.py b/hud/tests/test_datasets_extended.py
index 3a870aaa..67b23a8c 100644
--- a/hud/tests/test_datasets_extended.py
+++ b/hud/tests/test_datasets_extended.py
@@ -237,5 +237,6 @@ async def test_run_dataset_passes_parameters(self):
                 group=3,
                 max_concurrent=10,
                 quiet=True,
-                taskset=None,
+                job_id=None,
+                taskset_id=None,
             )

From b296c43bdaa1d796e9ea8e877680d17c5046e40a Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 22:17:18 -0800
Subject: [PATCH 08/14] validation for subscores

---
 hud/tools/tests/test_types.py | 77 +++++++++++++++++++++++++++++++++++
 hud/tools/types.py            | 32 +++++++++++++--
 2 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/hud/tools/tests/test_types.py b/hud/tools/tests/test_types.py
index daba05a4..157be404 100644
--- a/hud/tools/tests/test_types.py
+++ b/hud/tools/tests/test_types.py
@@ -437,3 +437,80 @@ def test_evaluation_result_isError_flag():
 
     assert result.isError is True
     assert result.reward == 0.0
+
+
+# Tests for SubScore and EvaluationResult validators
+
+
+def test_subscore_value_range_rejected():
+    """Test SubScore rejects values outside [0, 1]."""
+    from pydantic import ValidationError
+
+    with pytest.raises(ValidationError):
+        SubScore(name="test", value=-0.1)
+    with pytest.raises(ValidationError):
+        SubScore(name="test", value=1.5)
+
+
+def test_check_subscores_duplicate_names_warns():
+    """Test duplicate subscore names produce a warning."""
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        EvaluationResult(
+            reward=0.5,
+            subscores=[
+                SubScore(name="accuracy", weight=0.5, value=0.5),
+                SubScore(name="accuracy", weight=0.5, value=0.5),
+            ],
+        )
+    assert any("Duplicate subscore names" in str(x.message) for x in w)
+
+
+def test_check_subscores_weights_not_summing_to_one_warns():
+    """Test positive weights not summing to ~1.0 produce a warning."""
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        EvaluationResult(
+            reward=0.75,
+            subscores=[
+                SubScore(name="a", weight=0.5, value=1.0),
+                SubScore(name="b", weight=0.25, value=1.0),
+            ],
+        )
+    assert any("Positive subscore weights should sum to ~1.0" in str(x.message) for x in w)
+
+
+def test_check_subscores_reward_mismatch_warns():
+    """Test weighted sum not matching reward produces a warning."""
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        EvaluationResult(
+            reward=0.5,
+            subscores=[SubScore(name="a", weight=1.0, value=0.8)],
+        )
+    assert any("Subscores don't match reward" in str(x.message) for x in w)
+
+
+def test_check_subscores_valid_with_negative_weights():
+    """Test valid subscores with negative weights produce no warnings."""
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        # Positive: 0.6 + 0.4 = 1.0
+        # Weighted sum: 0.6*1.0 + 0.4*0.5 + (-0.2)*1.0 = 0.6
+        EvaluationResult(
+            reward=0.6,
+            subscores=[
+                SubScore(name="quality", weight=0.6, value=1.0),
+                SubScore(name="speed", weight=0.4, value=0.5),
+                SubScore(name="penalty", weight=-0.2, value=1.0),
+            ],
+        )
+    assert len(w) == 0
diff --git a/hud/tools/types.py b/hud/tools/types.py
index d4d0aa16..83c741bc 100644
--- a/hud/tools/types.py
+++ b/hud/tools/types.py
@@ -3,7 +3,9 @@
 from typing import Any
 
 from mcp.types import ContentBlock, ImageContent, TextContent
-from pydantic import BaseModel, ConfigDict, Field
+import warnings
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 
 class Coordinate(BaseModel):
@@ -36,8 +38,8 @@ class SubScore(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
     name: str = Field(..., description="Name of this subscore component")
-    weight: float = Field(default=1.0, description="Weight of this subscore (for weighted average)")
-    value: float = Field(..., description="Value of this subscore, usually 0.0 to 1.0")
+    weight: float = Field(default=1.0, description="Weight of this subscore (for weighted average). Negative weights represent penalties.")
+    value: float = Field(..., ge=0.0, le=1.0, description="Value of this subscore, 0.0 to 1.0")
     metadata: dict[str, Any] | None = Field(default=None, exclude=True)
 
     @property
@@ -76,6 +78,30 @@ class EvaluationResult(BaseModel):
 
     model_config = ConfigDict(extra="allow")
 
+    @model_validator(mode="after")
+    def _check_subscores(self) -> EvaluationResult:
+        if not self.subscores:
+            return self
+        names = [s.name for s in self.subscores]
+        dupes = [n for n in names if names.count(n) > 1]
+        if dupes:
+            warnings.warn(f"Duplicate subscore names: {set(dupes)}", stacklevel=2)
+        pos_weight_sum = sum(s.weight for s in self.subscores if s.weight > 0)
+        if abs(pos_weight_sum - 1.0) > 0.01:
+            warnings.warn(
+                f"Positive subscore weights should sum to ~1.0 (got {pos_weight_sum:.4f}). "
+                f"Weights represent proportional contributions to the reward.",
+                stacklevel=2,
+            )
+        weighted_sum = sum(s.value * s.weight for s in self.subscores)
+        if abs(weighted_sum - self.reward) > 0.01:
+            warnings.warn(
+                f"Subscores don't match reward: "
+                f"sum(value*weight)={weighted_sum:.4f} but reward={self.reward:.4f}",
+                stacklevel=2,
+            )
+        return self
+
     @classmethod
     def from_float(cls, value: float) -> EvaluationResult:
         """Create an EvaluationResult from a simple float reward.

From 5f2af134bb1b3c7ddbd8a896b64f08b31daceea5 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 22:18:19 -0800
Subject: [PATCH 09/14] ruff

---
 hud/tools/types.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/hud/tools/types.py b/hud/tools/types.py
index 83c741bc..7e014cfa 100644
--- a/hud/tools/types.py
+++ b/hud/tools/types.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
+import warnings
 from typing import Any
 
 from mcp.types import ContentBlock, ImageContent, TextContent
-import warnings
-
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 
@@ -38,7 +37,11 @@ class SubScore(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
     name: str = Field(..., description="Name of this subscore component")
-    weight: float = Field(default=1.0, description="Weight of this subscore (for weighted average). Negative weights represent penalties.")
+    weight: float = Field(
+        default=1.0,
+        description="Weight of this subscore (for weighted average). "
+        "Negative weights represent penalties.",
+    )
     value: float = Field(..., ge=0.0, le=1.0, description="Value of this subscore, 0.0 to 1.0")
     metadata: dict[str, Any] | None = Field(default=None, exclude=True)
 

From 437c79c544e4ae3beb370caf413b3714824451a9 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 22:19:56 -0800
Subject: [PATCH 10/14] bump version to 0.5.29

---
 hud/cli/tests/test_build.py     | 4 ++--
 hud/utils/tests/test_version.py | 2 +-
 hud/version.py                  | 2 +-
 pyproject.toml                  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hud/cli/tests/test_build.py b/hud/cli/tests/test_build.py
index c6ebadab..94cf51c0 100644
--- a/hud/cli/tests/test_build.py
+++ b/hud/cli/tests/test_build.py
@@ -60,12 +60,12 @@ def test_increment_patch(self):
     def test_increment_minor(self):
         """Test incrementing minor version."""
         assert increment_version("1.2.3", "minor") == "1.3.0"
-        assert increment_version("0.5.28", "minor") == "0.6.0"
+        assert increment_version("0.5.29", "minor") == "0.6.0"
 
     def test_increment_major(self):
         """Test incrementing major version."""
         assert increment_version("1.2.3", "major") == "2.0.0"
-        assert increment_version("0.5.28", "major") == "1.0.0"
+        assert increment_version("0.5.29", "major") == "1.0.0"
 
     def test_increment_with_v_prefix(self):
         """Test incrementing version with v prefix."""
diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py
index ada8b553..16c8ade8 100644
--- a/hud/utils/tests/test_version.py
+++ b/hud/utils/tests/test_version.py
@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
 
-    assert hud.__version__ == "0.5.28"
+    assert hud.__version__ == "0.5.29"
diff --git a/hud/version.py b/hud/version.py
index 7f525011..c7b914db 100644
--- a/hud/version.py
+++ b/hud/version.py
@@ -4,4 +4,4 @@
 
 from __future__ import annotations
 
-__version__ = "0.5.28"
+__version__ = "0.5.29"
diff --git a/pyproject.toml b/pyproject.toml
index 910ae2ee..55999bcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.5.28"
+version = "0.5.29"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.13"

From 305290d892f3872bb31798ee7da7781d3508d777 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 23:43:54 -0800
Subject: [PATCH 11/14] enhance error handling and job registration in
 evaluation process

---
 hud/cli/eval.py     | 15 +++++++++----
 hud/eval/manager.py | 54 +++++++++++++++++----------------------------
 2 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index bb386245..f0b58768 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -621,15 +621,22 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
     hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
     path = Path(cfg.source)
     taskset_id: str | None = None
-    if path.exists() and path.suffix in {".json", ".jsonl"}:
-        tasks = _load_from_file(path)
-    else:
-        tasks, taskset_id = _load_from_api(cfg.source)
+    try:
+        if path.exists() and path.suffix in {".json", ".jsonl"}:
+            tasks = _load_from_file(path)
+        else:
+            tasks, taskset_id = _load_from_api(cfg.source)
+    except Exception as e:
+        hud_console.error(f"Failed to load tasks from {cfg.source}: {e}")
+        raise typer.Exit(1) from e
 
     if not tasks:
         hud_console.error(f"No tasks found in: {cfg.source}")
         raise typer.Exit(1)
 
+    if cfg.taskset:
+        taskset_id = cfg.taskset
+
     # Filter by task slugs (or positional indices) if provided
     if cfg.task_ids:
         selector_set = set(cfg.task_ids)
diff --git a/hud/eval/manager.py b/hud/eval/manager.py
index f2d793bc..78d552a5 100644
--- a/hud/eval/manager.py
+++ b/hud/eval/manager.py
@@ -238,9 +238,21 @@ async def run_eval(
     # Lazy import to avoid circular dependency
     from hud.eval.context import EvalContext
 
+    # Register job if not already provided by caller
+    eval_name = _get_eval_name(tasks=tasks, group=group)
+    if not job_id and (taskset_id or total_evals > 1):
+        job_id = str(uuid.uuid4())
+        await _send_job_enter(
+            job_id=job_id,
+            name=eval_name,
+            variants=variants,
+            group=group,
+            api_key=api_key,
+            taskset_id=taskset_id,
+        )
+
     if total_evals == 1:
         if tasks:
-            # Single task - use EvalContext.from_task()
             ctx = EvalContext.from_task(
                 tasks[0],
                 name=name,
@@ -256,7 +268,6 @@ async def run_eval(
             async with ctx:
                 yield ctx
         else:
-            # Blank eval - use EvalContext directly
             ctx = EvalContext(
                 name=name or "eval",
                 trace_id=trace_id,
@@ -272,35 +283,19 @@ async def run_eval(
                 yield ctx
 
     else:
-        # Parallel execution: create implicit job to group traces
-        eval_name = _get_eval_name(tasks=tasks, group=group)
-        implicit_job_id = job_id or str(uuid.uuid4())
-        job_url = f"https://hud.ai/jobs/{implicit_job_id}"
-
-        # Register job if not already provided by caller
-        if not job_id:
-            await _send_job_enter(
-                job_id=implicit_job_id,
-                name=eval_name,
-                variants=variants,
-                group=group,
-                api_key=api_key,
-                taskset_id=taskset_id,
-            )
+        job_url = f"https://hud.ai/jobs/{job_id}"
 
-        # Print job URL (not individual trace URLs)
         if not quiet:
             print_link(job_url, f"🚀 {eval_name}")
 
         error_occurred = False
         try:
-            # Run parallel evals with job_id
             completed = await _run_parallel_eval(
                 tasks=tasks,
                 variant_combos=variant_combos,
                 group=group,
                 group_ids=group_ids,
-                job_id=implicit_job_id,  # Propagate job_id to child traces
+                job_id=job_id,
                 api_key=api_key,
                 code_snippet=code_snippet,
                 max_concurrent=max_concurrent,
@@ -308,20 +303,11 @@ async def run_eval(
                 quiet=quiet,
             )
 
-            # Create summary context (no trace, just aggregates results)
-            if tasks:
-                # Create summary from first task
-                ctx = EvalContext(
-                    name=eval_name,  # Use the same smart name
-                    api_key=api_key,
-                    job_id=implicit_job_id,
-                )
-            else:
-                ctx = EvalContext(
-                    name="eval",
-                    api_key=api_key,
-                    job_id=implicit_job_id,
-                )
+            ctx = EvalContext(
+                name=eval_name,
+                api_key=api_key,
+                job_id=job_id,
+            )
 
             ctx._is_summary = True  # Skip trace tracking
             ctx.results = completed

From 0aed66402ad417ff9c524541a716ad3b5d46c9c3 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 27 Feb 2026 23:56:39 -0800
Subject: [PATCH 12/14] tests

---
 hud/eval/tests/test_manager.py | 88 +++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/hud/eval/tests/test_manager.py b/hud/eval/tests/test_manager.py
index 9b237382..2afd73a3 100644
--- a/hud/eval/tests/test_manager.py
+++ b/hud/eval/tests/test_manager.py
@@ -7,7 +7,8 @@
 import pytest
 
 from hud.eval.context import EvalContext, get_current_trace_headers
-from hud.eval.manager import run_eval
+from hud.eval.manager import _get_eval_name, run_eval
+from hud.eval.task import Task
 
 
 class TestRunEvalNoArgs:
@@ -150,3 +151,88 @@ async def test_error_tracked_on_exception(self) -> None:
             error_msg = mock_exit.call_args[0][0]
             assert error_msg is not None
             assert "test error" in error_msg
+
+
+class TestGetEvalName:
+    """Tests for _get_eval_name() naming convention."""
+
+    def test_no_tasks(self) -> None:
+        assert _get_eval_name() == "Task Run: eval"
+
+    def test_no_tasks_with_group(self) -> None:
+        assert _get_eval_name(group=4) == "Task Run: eval (4 times)"
+
+    def test_single_task_with_scenario(self) -> None:
+        tasks = [Task(env={"name": "browser"}, scenario="checkout")]
+        assert _get_eval_name(tasks=tasks) == "Task Run: checkout"
+
+    def test_single_task_with_scenario_and_group(self) -> None:
+        tasks = [Task(env={"name": "browser"}, scenario="checkout")]
+        assert _get_eval_name(tasks=tasks, group=4) == "Task Run: checkout (4 times)"
+
+    def test_single_task_no_scenario_uses_env_name(self) -> None:
+        tasks = [Task(env={"name": "my-env"})]
+        assert _get_eval_name(tasks=tasks) == "Task Run: my-env"
+
+    def test_multiple_tasks(self) -> None:
+        tasks = [
+            Task(env={"name": "browser"}, scenario="checkout"),
+            Task(env={"name": "browser"}, scenario="login"),
+        ]
+        assert _get_eval_name(tasks=tasks) == "Batch Run: 2 tasks"
+
+    def test_multiple_tasks_with_group(self) -> None:
+        tasks = [
+            Task(env={"name": "browser"}, scenario="checkout"),
+            Task(env={"name": "browser"}, scenario="login"),
+            Task(env={"name": "browser"}, scenario="search"),
+        ]
+        assert _get_eval_name(tasks=tasks, group=3) == "Batch Run: 3 tasks (3 times)"
+
+
+class TestRunEvalTasksetId:
+    """Tests for taskset_id flow through run_eval."""
+
+    @pytest.mark.asyncio
+    async def test_taskset_id_triggers_job_registration(self) -> None:
+        """run_eval(taskset_id=...) registers a job even for single task."""
+        with (
+            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
+            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
+            patch("hud.eval.manager._send_job_enter", new_callable=AsyncMock) as mock_enter,
+        ):
+            async with run_eval(taskset_id="ts-123", quiet=True) as ctx:
+                pass
+
+            mock_enter.assert_called_once()
+            call_kwargs = mock_enter.call_args[1]
+            assert call_kwargs["taskset_id"] == "ts-123"
+            assert ctx.job_id == call_kwargs["job_id"]
+
+    @pytest.mark.asyncio
+    async def test_no_taskset_no_job_for_single_task(self) -> None:
+        """run_eval() without taskset_id does not register a job for single task."""
+        with (
+            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
+            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
+            patch("hud.eval.manager._send_job_enter", new_callable=AsyncMock) as mock_enter,
+        ):
+            async with run_eval(quiet=True) as ctx:
+                pass
+
+            mock_enter.assert_not_called()
+            assert ctx.job_id is None
+
+    @pytest.mark.asyncio
+    async def test_provided_job_id_skips_registration(self) -> None:
+        """run_eval(job_id=..., taskset_id=...) uses provided job_id without registering."""
+        with (
+            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
+            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
+            patch("hud.eval.manager._send_job_enter", new_callable=AsyncMock) as mock_enter,
+        ):
+            async with run_eval(job_id="existing-job", taskset_id="ts-123", quiet=True) as ctx:
+                pass
+
+            mock_enter.assert_not_called()
+            assert ctx.job_id == "existing-job"

From 3ed7330bdfd3a3d0a9ea99d8fb4d574ba4f6dce2 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Sat, 28 Feb 2026 00:15:27 -0800
Subject: [PATCH 13/14] taskset name resolution

---
 hud/cli/eval.py        | 14 +++++++++++---
 hud/datasets/loader.py | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index f0b58768..7a1fb18d 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -186,7 +186,7 @@ class EvalConfig(BaseModel):
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
-    taskset: str | None = None  # Taskset slug to associate job with
+    taskset: str | None = None  # Taskset name to associate job with
 
     # Base agent config (these merge with task's agent_config)
     allowed_tools: list[str] | None = None
@@ -634,8 +634,16 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         hud_console.error(f"No tasks found in: {cfg.source}")
         raise typer.Exit(1)
 
+    # TODO: --taskset with file source should sync local tasks to the platform taskset
+    # (diff, save, then run). For now it just resolves the slug and associates the job.
     if cfg.taskset:
-        taskset_id = cfg.taskset
+        from hud.datasets.loader import resolve_taskset_id
+
+        try:
+            taskset_id = resolve_taskset_id(cfg.taskset)
+        except Exception as e:
+            hud_console.error(f"Failed to resolve taskset '{cfg.taskset}': {e}")
+            raise typer.Exit(1) from e
 
     # Filter by task slugs (or positional indices) if provided
     if cfg.task_ids:
@@ -827,7 +835,7 @@ def eval_command(
         False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
     taskset: str | None = typer.Option(
-        None, "--taskset", "-t", help="Taskset slug to associate job with"
+        None, "--taskset", "-t", help="Taskset name to associate job with"
     ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py
index ebc0adde..63b355ee 100644
--- a/hud/datasets/loader.py
+++ b/hud/datasets/loader.py
@@ -70,6 +70,26 @@ def _load_from_file(path: Path) -> list[Task]:
     return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
 
 
+def resolve_taskset_id(slug: str) -> str:
+    """Resolve a taskset slug/name to its UUID via the HUD API."""
+    headers = {}
+    if settings.api_key:
+        headers["Authorization"] = f"Bearer {settings.api_key}"
+
+    with httpx.Client() as client:
+        response = client.get(
+            f"{settings.hud_api_url}/tasks/evalset/{slug}",
+            headers=headers,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    evalset_id = data.get("evalset_id")
+    if not evalset_id:
+        raise ValueError(f"Could not resolve taskset '{slug}' — not found or no access")
+    return evalset_id
+
+
 def _load_raw_from_api(dataset_name: str) -> tuple[list[dict[str, Any]], str | None]:
     """Load raw task dicts from HUD API.
 

From 7df816120b34a09755daadbc94420d015b8b0e41 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Sat, 28 Feb 2026 00:37:32 -0800
Subject: [PATCH 14/14] docs update

---
 docs/platform/tasksets.mdx  |  2 +-
 docs/quick-links/evals.mdx  | 11 +++++---
 docs/reference/cli/eval.mdx | 56 +++++++++++++++++++++++++++----------
 docs/reference/evals.mdx    |  1 +
 hud/cli/eval.py             | 25 +++--------------
 hud/datasets/loader.py      |  6 ++--
 hud/datasets/utils.py       |  7 -----
 7 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/docs/platform/tasksets.mdx b/docs/platform/tasksets.mdx
index 3427aab0..70d5df4f 100644
--- a/docs/platform/tasksets.mdx
+++ b/docs/platform/tasksets.mdx
@@ -116,7 +116,7 @@ Run all tasks in a taskset with one click:
 Or run from the CLI:
 
 ```bash
-hud eval my-taskset --model gpt-4o --group-size 10
+hud eval "My Tasks" claude --full --group-size 10
 ```
 
 ## Task Configuration
diff --git a/docs/quick-links/evals.mdx b/docs/quick-links/evals.mdx
index f7e60947..71dffe11 100644
--- a/docs/quick-links/evals.mdx
+++ b/docs/quick-links/evals.mdx
@@ -85,14 +85,17 @@ See [Platform Models](/platform/models) for training details.
 Prefer the command line? Use `hud eval` for running evaluations locally or remotely:
 
 ```bash
-# Run a taskset with a model
-hud eval my-taskset claude --full
+# Run a platform taskset with a model
+hud eval "My Tasks" claude --full
 
 # Run with multiple repeats for variance
-hud eval my-taskset claude --full --group-size 5
+hud eval "My Tasks" claude --full --group-size 5
 
 # Run remotely on HUD infrastructure
-hud eval my-taskset claude --full --remote
+hud eval "My Tasks" claude --full --remote
+
+# Run from a local file, linked to a platform taskset
+hud eval tasks.json claude --full --taskset "My Tasks"
 ```
 
 See [`hud eval` CLI reference](/reference/cli/eval) for all options.
diff --git a/docs/reference/cli/eval.mdx b/docs/reference/cli/eval.mdx
index da2f8219..d10e361f 100644
--- a/docs/reference/cli/eval.mdx
+++ b/docs/reference/cli/eval.mdx
@@ -4,7 +4,7 @@ description: "Run agents on tasks or datasets"
 icon: "robot"
 ---
 
-The `hud eval` command runs an agent on a tasks file or HuggingFace dataset.
+The `hud eval` command runs an agent on a tasks file or platform taskset.
 
 <Note>
 **Local Execution Dependencies**: Running Claude or Gemini agents locally requires additional packages:
@@ -23,7 +23,7 @@ hud eval [SOURCE] [AGENT] [OPTIONS]
 ## Arguments
 
 <ParamField path="source" type="string">
-  HuggingFace dataset (e.g., `hud-evals/SheetBench-50`) or task JSON/JSONL file.
+  Platform taskset name (e.g., `My Tasks`) or local task JSON/JSONL file. When loading from a platform taskset, the job is automatically associated with that taskset.
 </ParamField>
 
 <ParamField path="agent" type="string">
@@ -82,6 +82,18 @@ hud eval [SOURCE] [AGENT] [OPTIONS]
   Use ResponseAgent to decide when to stop/continue. Default: True for `--full`.
 </ParamField>
 
+### Taskset Association
+
+<ParamField path="--taskset" type="string">
+  Taskset name to associate the job with. Resolves the name to a taskset UUID on the platform. Useful when you run from a local file but want the job to appear under a platform taskset.
+</ParamField>
+
+### LLM Routing
+
+<ParamField path="--gateway" type="boolean" default="false">
+  Route LLM API calls through HUD Gateway. If you have API keys stored on the platform, they're used automatically (BYOK) at a lower credit cost. Otherwise, pooled keys are used.
+</ParamField>
+
 ### Output & Confirmation
 
 <ParamField path="--verbose, -v" type="boolean" default="false">
@@ -92,6 +104,10 @@ hud eval [SOURCE] [AGENT] [OPTIONS]
   Enable debug-level logs.
 </ParamField>
 
+<ParamField path="--quiet" type="boolean" default="false">
+  Suppress opening the browser for eval links.
+</ParamField>
+
 <ParamField path="--yes, -y" type="boolean" default="false">
   Skip confirmation prompt.
 </ParamField>
@@ -107,14 +123,16 @@ On first run, a template is created:
 ```toml
 # .hud_eval.toml
 [eval]
-# source = "hud-evals/SheetBench-50"
+# source = "My Tasks"
 # agent = "claude"
 # full = false
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
-# task_ids = ["task_1", "task_2"]
+# task_ids = ["checkout-smoke", "0"]  # slugs or 0-based indices
 # auto_respond = true
+# gateway = false
+# quiet = false
 
 [agent]
 # allowed_tools = ["computer", "playwright"]
@@ -141,11 +159,17 @@ On first run, a template is created:
 ## Examples
 
 ```bash
-# Single task (debug mode)
-hud eval tasks.json claude
+# Run a single task from a platform taskset (debug mode)
+hud eval "My Tasks" claude
+
+# Full taskset evaluation
+hud eval "My Tasks" claude --full
 
-# Full dataset evaluation
-hud eval hud-evals/SheetBench-50 claude --full
+# Run from a local file
+hud eval tasks.json claude --full
+
+# Local file, associated with a platform taskset
+hud eval tasks.json claude --full --taskset "My Tasks"
 
 # Run specific tasks by ID
 hud eval tasks.json claude --task-ids task_1,task_5
@@ -158,13 +182,16 @@ hud eval tasks.json claude --config max_tokens=32768
 hud eval tasks.json openai --config temperature=0.7
 
 # High concurrency
-hud eval hud-evals/SheetBench-50 claude --full --max-concurrent 100
+hud eval "My Tasks" claude --full --max-concurrent 100
 
 # Variance estimation (run each task 3 times)
-hud eval tasks.json claude --full --group-size 3
+hud eval "My Tasks" claude --full --group-size 3
 
 # Remote execution on HUD platform
-hud eval hud-evals/SheetBench-50 claude --full --remote
+hud eval "My Tasks" claude --full --remote
+
+# Route through HUD Gateway (no provider API keys needed)
+hud eval tasks.json claude --full --gateway
 
 # OpenAI-compatible endpoint (vLLM, Ollama, etc.)
 hud eval tasks.json openai_compatible \
@@ -187,8 +214,9 @@ When agent is omitted, an interactive selector shows presets:
 ❯ Claude Sonnet 4.5
   GPT-5
   Operator (OpenAI Computer Use)
-  Gemini 2.5 Computer Use
-  Grok 4.1 Fast
+  Gemini 3 Pro Preview
+  Gemini CUA (Gemini Computer Use)
+  Grok 4-1 Fast (xAI)
 ```
 
 ## Remote Execution
@@ -196,7 +224,7 @@ When agent is omitted, an interactive selector shows presets:
 With `--remote`, both the **agent** and **environment** run on HUD infrastructure:
 
 ```bash
-hud eval hud-evals/SheetBench-50 claude --full --remote
+hud eval "My Tasks" claude --full --remote
 ```
 
 - **Remote agent**: Runs on HUD workers (no local compute needed)
diff --git a/docs/reference/evals.mdx b/docs/reference/evals.mdx
index 0a19a6e1..e2471d92 100644
--- a/docs/reference/evals.mdx
+++ b/docs/reference/evals.mdx
@@ -26,6 +26,7 @@ async with hud.eval() as ctx:
 | `group` | `int` | Runs per variant for statistical significance | `1` |
 | `group_ids` | `list[str] \| None` | Custom group IDs for parallel runs | `None` |
 | `job_id` | `str \| None` | Job ID to link traces to | `None` |
+| `taskset_id` | `str \| None` | Platform taskset UUID to associate the job with | `None` |
 | `api_key` | `str \| None` | API key for backend calls | `None` |
 | `max_concurrent` | `int \| None` | Maximum concurrent evaluations | `None` |
 | `trace` | `bool` | Send telemetry to backend | `True` |
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 7a1fb18d..9e9db45f 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -96,7 +96,6 @@ class AgentPreset:
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
-# byok = false  # Remote only; use encrypted env vars on the platform.
 # task_ids = ["checkout-smoke", "0"]  # slugs or 0-based indices
 # verbose = true
 # very_verbose = true
@@ -160,7 +159,6 @@ class EvalConfig(BaseModel):
         "verbose",
         "very_verbose",
         "group_size",
-        "byok",
         "remote",
         "auto_respond",
         "quiet",
@@ -182,7 +180,6 @@ class EvalConfig(BaseModel):
     very_verbose: bool = False
     auto_respond: bool | None = None  # Continue without prompting
     group_size: int = 1
-    byok: bool = False
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
@@ -214,11 +211,6 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
 
     def validate_api_keys(self) -> None:
         """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
-        # BYOK requires remote execution (check before agent_type guard)
-        if self.byok and not self.remote:
-            hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
-            raise typer.Exit(1)
-
         if self.agent_type is None:
             return
 
@@ -547,8 +539,6 @@ def display(self) -> None:
             table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
         if self.gateway:
             table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
-        if self.byok:
-            table.add_row("byok", "[bold green]True[/bold green] (remote only)")
 
         # Tool filters (only if set)
         if self.allowed_tools:
@@ -726,7 +716,6 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             agent_params=agent_kwargs,
             max_steps=max_steps,
             group_size=cfg.group_size,
-            use_byok=cfg.byok,
         )
 
         if not trace_ids:
@@ -823,11 +812,6 @@ def eval_command(
     remote: bool = typer.Option(
         False, "--remote", help="Submit tasks to platform for remote execution"
     ),
-    byok: bool = typer.Option(
-        False,
-        "--byok",
-        help="Remote only: use BYOK keys from encrypted env vars for inference",
-    ),
     quiet: bool = typer.Option(
         False, "--quiet", "-q", help="Suppress opening browser for eval links"
     ),
@@ -842,11 +826,11 @@ def eval_command(
 
     Examples:
         hud eval tasks.json claude
-        hud eval hud-evals/SheetBench-50 claude --full
+        hud eval "My Tasks" claude --full              # Load from platform taskset
+        hud eval tasks.json claude --taskset "My Tasks" # Associate file tasks with taskset
         hud eval tasks.json claude --config max_tokens=32768
-        hud eval tasks.json openai --config temperature=0.7
-        hud eval tasks.json claude --full --remote  # Remote execution
-        hud eval tasks.json claude --gateway  # Route LLM calls through HUD Gateway
+        hud eval tasks.json claude --full --remote     # Remote execution
+        hud eval tasks.json claude --gateway           # Route LLM calls through HUD Gateway
     """
     hud_console.info("🔧 Initializing evaluation...")
 
@@ -877,7 +861,6 @@ def eval_command(
         group_size=group_size,
         config=config,
         remote=remote,
-        byok=byok,
         quiet=quiet,
         gateway=gateway,
         taskset=taskset,
diff --git a/hud/datasets/loader.py b/hud/datasets/loader.py
index 63b355ee..51b2d7df 100644
--- a/hud/datasets/loader.py
+++ b/hud/datasets/loader.py
@@ -22,7 +22,7 @@
 
 logger = logging.getLogger(__name__)
 
-__all__ = ["load_dataset", "load_tasks", "save_tasks"]
+__all__ = ["load_dataset", "load_tasks", "resolve_taskset_id", "save_tasks"]
 
 
 def _load_raw_from_file(path: Path) -> list[dict[str, Any]]:
@@ -162,7 +162,9 @@ def load_tasks(source: str, *, raw: bool = False) -> list[Task] | list[dict[str,
         - If raw=True: list[dict] with raw task data
 
     Raises:
-        ValueError: If task loading fails
+        httpx.HTTPStatusError: If the API returns an error (e.g., 404 for an unknown taskset).
+        httpx.ConnectError: If the API is unreachable.
+        ValueError: If the file format is invalid.
     """
     # Check if it's a local file
     path = Path(source)
diff --git a/hud/datasets/utils.py b/hud/datasets/utils.py
index f218278a..9cd64057 100644
--- a/hud/datasets/utils.py
+++ b/hud/datasets/utils.py
@@ -51,10 +51,6 @@ class SingleTaskRequest(BaseModel):
         description="Additional metadata to inject into the trace context.",
     )
     trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
-    use_byok: bool = Field(
-        default=False,
-        description="If True, use BYOK headers from encrypted env vars for inference.",
-    )
 
     @model_validator(mode="after")
     def _validate_task(self) -> SingleTaskRequest:
@@ -125,7 +121,6 @@ async def submit_rollouts(
     group_size: int = 1,
     batch_size: int = 50,
     metadata: dict[str, Any] | None = None,
-    use_byok: bool = False,
 ) -> list[str]:
     """Submit rollouts to the HUD platform API for remote execution.
 
@@ -140,7 +135,6 @@ async def submit_rollouts(
         group_size: Number of rollouts per task (for variance estimation)
         batch_size: Number of rollouts per API batch request
         metadata: Additional metadata for each rollout
-        use_byok: If True, use BYOK keys from encrypted env vars (remote only)
     """
     from hud.eval.utils import is_v4_format
 
@@ -189,7 +183,6 @@ async def submit_rollouts(
                     trace_name=trace_name,
                     group_id=base_task_id if group_size > 1 else None,
                     metadata=metadata or {},
-                    use_byok=use_byok,
                 )
             )