From 9b4f45c79d87fb72266371f4f7917c2bc002e83f Mon Sep 17 00:00:00 2001
From: MagellaX <alphacr792@gmail.com>
Date: Mon, 26 Jan 2026 14:06:45 +0530
Subject: [PATCH 1/5] Add task validation command

---
 hud/cli/__init__.py            | 12 +++++++++
 hud/cli/validate.py            | 48 ++++++++++++++++++++++++++++++++++
 hud/tests/test_validate_cli.py | 43 ++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+)
 create mode 100644 hud/cli/validate.py
 create mode 100644 hud/tests/test_validate_cli.py

diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index 02b788d78..6067cff64 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -30,6 +30,7 @@
 from .pull import pull_command
 from .push import push_command
 from .remove import remove_command
+from .validate import validate_command
 from .utils.config import set_env_values
 from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .utils.logging import CaptureLogger
@@ -790,6 +791,17 @@ def quickstart() -> None:
     clone("https://github.com/hud-evals/quickstart.git")
 
 
+@app.command()
+def validate(
+    source: str = typer.Argument(  # type: ignore[arg-type]  # noqa: B008
+        ...,
+        help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)",
+    ),
+) -> None:
+    """Validate task files or datasets without running them."""
+    validate_command(source)
+
+
 @app.command()
 def eval(
     source: str | None = typer.Argument(
diff --git a/hud/cli/validate.py b/hud/cli/validate.py
new file mode 100644
index 000000000..19fbe2a9e
--- /dev/null
+++ b/hud/cli/validate.py
@@ -0,0 +1,48 @@
+"""Validate task files or datasets."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import typer
+from pydantic import ValidationError
+
+from hud.types import Task
+from hud.utils.tasks import load_tasks
+from hud.utils.hud_console import hud_console
+
+
+def validate_command(source: str) -> None:
+    """Validate tasks from a file or HuggingFace dataset."""
+    try:
+        raw_tasks = load_tasks(source, raw=True)
+    except Exception as e:
+        hud_console.error(f"Failed to load tasks: {e}")
+        raise typer.Exit(1) from e
+
+    errors: list[str] = []
+    for idx, task in enumerate(raw_tasks):
+        label = task.get("id") or f"index {idx}"
+        try:
+            Task(**_as_dict(task))
+        except ValidationError as e:
+            errors.append(f"{label}: {e}")
+        except Exception as e:
+            errors.append(f"{label}: {e}")
+
+    if errors:
+        hud_console.error(f"Found {len(errors)} invalid task(s).")
+        for err in errors:
+            hud_console.error(f"- {err}")
+        raise typer.Exit(1)
+
+    hud_console.success(f"Validated {len(raw_tasks)} task(s).")
+
+
+def _as_dict(task: Any) -> dict[str, Any]:
+    if isinstance(task, dict):
+        return task
+    try:
+        return dict(task)
+    except Exception:
+        return {}
diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py
new file mode 100644
index 000000000..cf1f65f2f
--- /dev/null
+++ b/hud/tests/test_validate_cli.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+import typer
+
+
+def _load_validate_command():
+    module_path = Path(__file__).resolve().parents[1] / "cli" / "validate.py"
+    spec = importlib.util.spec_from_file_location("hud.cli.validate", module_path)
+    assert spec and spec.loader  # check before first use so a missing spec fails here
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # executes validate.py's top-level code
+    return module.validate_command
+
+
+def _write_tasks(path: Path, tasks: list[dict]) -> str:
+    path.write_text(json.dumps(tasks), encoding="utf-8")  # written as a single JSON document
+    return str(path)  # validate_command takes its source as a str
+
+
+def test_validate_command_valid(tmp_path: Path) -> None:
+    """A task carrying prompt, mcp_config and evaluate_tool must validate without raising."""
+    well_formed = {
+        "prompt": "Say hello",
+        "mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
+        "evaluate_tool": {"name": "done", "arguments": {}},
+    }
+    tasks_file = _write_tasks(tmp_path / "tasks.json", [well_formed])
+    run_validation = _load_validate_command()
+    # The test passes when no typer.Exit (or other exception) escapes the command.
+    run_validation(tasks_file)
+
+
+def test_validate_command_invalid(tmp_path: Path) -> None:
+    validate_command = _load_validate_command()
+    tasks = [{"mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}]
+    path = _write_tasks(tmp_path / "tasks.json", tasks)
+    with pytest.raises(typer.Exit):
+        validate_command(path)

From edcd13df9d58fa11aac981514da53cb67c2a6dbf Mon Sep 17 00:00:00 2001
From: MagellaX <alphacr792@gmail.com>
Date: Mon, 26 Jan 2026 14:26:29 +0530
Subject: [PATCH 2/5] Validate: flag non-object task entries

---
 hud/cli/validate.py            | 56 +++++++++++++++++++++++++++++++++-
 hud/tests/test_validate_cli.py | 11 +++++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/hud/cli/validate.py b/hud/cli/validate.py
index 19fbe2a9e..8b50005c7 100644
--- a/hud/cli/validate.py
+++ b/hud/cli/validate.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import json
+from pathlib import Path
 from typing import Any
 
 import typer
@@ -15,12 +17,13 @@
 def validate_command(source: str) -> None:
     """Validate tasks from a file or HuggingFace dataset."""
     try:
-        raw_tasks = load_tasks(source, raw=True)
+        raw_tasks, type_errors = _load_raw_tasks(source)
     except Exception as e:
         hud_console.error(f"Failed to load tasks: {e}")
         raise typer.Exit(1) from e
 
     errors: list[str] = []
+    errors.extend(type_errors)
     for idx, task in enumerate(raw_tasks):
         label = task.get("id") or f"index {idx}"
         try:
@@ -46,3 +49,54 @@ def _as_dict(task: Any) -> dict[str, Any]:
         return dict(task)
     except Exception:
         return {}
+
+
+def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
+    path = Path(source)
+    if path.exists() and path.suffix.lower() in {".json", ".jsonl"}:
+        return _load_raw_from_file(path)
+    return load_tasks(source, raw=True), []
+
+
+def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
+    errors: list[str] = []
+    items: list[dict[str, Any]] = []
+
+    if path.suffix.lower() == ".jsonl":
+        with open(path, encoding="utf-8") as f:
+            for line_no, line in enumerate(f, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                value = json.loads(line)
+                if isinstance(value, dict):
+                    items.append(value)
+                    continue
+                if isinstance(value, list):
+                    for idx, entry in enumerate(value):
+                        if isinstance(entry, dict):
+                            items.append(entry)
+                        else:
+                            errors.append(
+                                f"line {line_no} item {idx}: expected object, got {type(entry).__name__}"
+                            )
+                    continue
+                errors.append(
+                    f"line {line_no}: expected object or list, got {type(value).__name__}"
+                )
+        return items, errors
+
+    with open(path, encoding="utf-8") as f:
+        value = json.load(f)
+
+    if isinstance(value, dict):
+        return [value], errors
+    if isinstance(value, list):
+        for idx, entry in enumerate(value):
+            if isinstance(entry, dict):
+                items.append(entry)
+            else:
+                errors.append(f"index {idx}: expected object, got {type(entry).__name__}")
+        return items, errors
+
+    raise ValueError(f"JSON file must contain an object or array, got {type(value).__name__}")
diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py
index cf1f65f2f..52d10d335 100644
--- a/hud/tests/test_validate_cli.py
+++ b/hud/tests/test_validate_cli.py
@@ -41,3 +41,14 @@ def test_validate_command_invalid(tmp_path: Path) -> None:
     path = _write_tasks(tmp_path / "tasks.json", tasks)
     with pytest.raises(typer.Exit):
         validate_command(path)
+
+
+def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None:
+    validate_command = _load_validate_command()
+    tasks = [
+        {"prompt": "ok", "mcp_config": {"local": {"command": "echo", "args": ["hi"]}}},
+        "not a task",
+    ]
+    path = _write_tasks(tmp_path / "tasks.json", tasks)
+    with pytest.raises(typer.Exit):
+        validate_command(path)

From fa8aade041ffa1edda4831ad709c14a5628b8d24 Mon Sep 17 00:00:00 2001
From: MagellaX <alphacr792@gmail.com>
Date: Mon, 26 Jan 2026 15:40:26 +0530
Subject: [PATCH 3/5] Fix eval CLI selection and strict v4 validation

---
 hud/cli/__init__.py | 16 +++++++---------
 hud/cli/validate.py | 12 +++++++++++-
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index 5096c068a..0ba34b21e 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -21,7 +21,6 @@
 from .debug import debug_mcp_stdio
 from .deploy import deploy_command
 from .dev import run_mcp_dev_server
-from .eval import eval_command
 from .link import link_command
 from .pull import pull_command
 from .push import push_command
@@ -1103,6 +1102,7 @@ def eval(
 
     # If no agent specified, fetch available models and prompt for selection
     base_model = None
+    hud_model_base_map: dict[str, str] = {}
     if agent is None:
         # Get available HUD models first
         hud_models = get_available_models()
@@ -1113,7 +1113,7 @@ def eval(
         # Add HUD models as agent choices
         for hud_model in hud_models:
             model_name = hud_model["name"]
-            base_model = hud_model["base_model"]
+            hud_model_base_map[model_name] = hud_model["base_model"]
             vllm_status = " ⚡" if hud_model.get("vllm_url") else ""
             choices.append({"name": f"{model_name}{vllm_status}", "value": f"{model_name}"})
 
@@ -1139,11 +1139,11 @@ def eval(
 
         # Set model to base model for the vllm endpoint
         if not base_model:
-            hud_models = get_available_models()
-            for hud_model in hud_models:
-                if hud_model["name"] == model:
-                    base_model = hud_model["base_model"]
-                    break
+            if not hud_model_base_map:
+                hud_models = get_available_models()
+                for hud_model in hud_models:
+                    hud_model_base_map[hud_model["name"]] = hud_model["base_model"]
+            base_model = hud_model_base_map.get(model)
         if not base_model:
             hud_console.error(f"Model {model} not found")
             raise typer.Exit(1)
@@ -1176,8 +1176,6 @@ def eval(
         integration_test=integration_test,
     )
 
-app.command(name="eval")(eval_command)
-
 
 
 @app.command()
diff --git a/hud/cli/validate.py b/hud/cli/validate.py
index 8b50005c7..baad9dad5 100644
--- a/hud/cli/validate.py
+++ b/hud/cli/validate.py
@@ -9,8 +9,9 @@
 import typer
 from pydantic import ValidationError
 
+from hud.datasets import load_tasks
+from hud.eval.utils import validate_v4_task
 from hud.types import Task
-from hud.utils.tasks import load_tasks
 from hud.utils.hud_console import hud_console
 
 
@@ -27,6 +28,8 @@ def validate_command(source: str) -> None:
     for idx, task in enumerate(raw_tasks):
         label = task.get("id") or f"index {idx}"
         try:
+            if _looks_like_v4(task):
+                validate_v4_task(task)
             Task(**_as_dict(task))
         except ValidationError as e:
             errors.append(f"{label}: {e}")
@@ -51,6 +54,13 @@ def _as_dict(task: Any) -> dict[str, Any]:
         return {}
 
 
+def _looks_like_v4(task: dict[str, Any]) -> bool:
+    """Return True when ``task`` contains at least one key this module treats as v4."""
+    v4_keys = ("prompt", "mcp_config", "evaluate_tool", "setup_tool", "integration_test_tool")
+    present = [name for name in v4_keys if name in task]
+    return bool(present)
+
+
 def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
     path = Path(source)
     if path.exists() and path.suffix.lower() in {".json", ".jsonl"}:

From 7e5773888630bc5769bf77d41809f73c8ae3218d Mon Sep 17 00:00:00 2001
From: MagellaX <alphacr792@gmail.com>
Date: Mon, 26 Jan 2026 15:57:51 +0530
Subject: [PATCH 4/5] Restore eval command registration and tighten types

---
 hud/cli/__init__.py | 186 +-------------------------------------------
 hud/cli/validate.py |   4 +-
 2 files changed, 4 insertions(+), 186 deletions(-)

diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index 0ba34b21e..f32cedc3e 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -21,6 +21,7 @@
 from .debug import debug_mcp_stdio
 from .deploy import deploy_command
 from .dev import run_mcp_dev_server
+from .eval import eval_command
 from .link import link_command
 from .pull import pull_command
 from .push import push_command
@@ -981,7 +982,6 @@ def quickstart() -> None:
     clone("https://github.com/hud-evals/quickstart.git")
 
 
-
 @app.command()
 def validate(
     source: str = typer.Argument(  # type: ignore[arg-type]  # noqa: B008
@@ -993,189 +993,7 @@ def validate(
     validate_command(source)
 
 
-@app.command()
-def eval(
-    source: str | None = typer.Argument(
-        None,
-        help=(
-            "HuggingFace dataset (e.g. 'hud-evals/SheetBench-50') or task JSON file. "
-            "If not provided, looks for task.json in current directory."
-        ),
-    ),
-    agent: str | None = typer.Argument(
-        None,
-        help=(
-            "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively."  # noqa: E501
-        ),
-    ),
-    full: bool = typer.Option(
-        False,
-        "--full",
-        help="Run the entire dataset (omit for single-task debug mode)",
-    ),
-    model: str | None = typer.Option(
-        None,
-        "--model",
-        help="Model name for the chosen agent",
-    ),
-    allowed_tools: str | None = typer.Option(
-        None,
-        "--allowed-tools",
-        help="Comma-separated list of allowed tools",
-    ),
-    max_concurrent: int = typer.Option(
-        30,
-        "--max-concurrent",
-        help="Maximum concurrent tasks (1-200 recommended, prevents rate limits)",
-    ),
-    max_steps: int | None = typer.Option(
-        None,
-        "--max-steps",
-        help="Maximum steps per task (default: 10 for single, 50 for full)",
-    ),
-    verbose: bool = typer.Option(
-        False,
-        "--verbose",
-        "-v",
-        help="Enable verbose output from the agent",
-    ),
-    very_verbose: bool = typer.Option(
-        False,
-        "--very-verbose",
-        "-vv",
-        help="Enable debug-level logs for maximum visibility",
-    ),
-    vllm_base_url: str | None = typer.Option(
-        None,
-        "--vllm-base-url",
-        help="Base URL for vLLM server (when using --agent vllm)",
-    ),
-    group_size: int = typer.Option(
-        1,
-        "--group-size",
-        help="Number of times to run each task (similar to RL training)",
-    ),
-    integration_test: bool = typer.Option(
-        False,
-        "--integration-test",
-        help=(
-            "Run integration_test_tool, where problem is setup, "
-            "actions are applied, and evaluation is performed, without "
-            "spinning up an agent"
-        ),
-    ),
-) -> None:
-    """🚀 Run evaluation on datasets or individual tasks with agents."""
-    from hud.settings import settings
-    from hud.utils.hud_console import HUDConsole
-
-    hud_console = HUDConsole()
-
-    if integration_test:
-        agent = AgentType.INTEGRATION_TEST
-
-    # If no source provided, reuse RL helper to find a tasks file interactively
-    if source is None:
-        try:
-            from hud.cli.utils.tasks import find_tasks_file
-
-            source = find_tasks_file(None, msg="Select a tasks file to run")
-            hud_console.success(f"Selected: {source}")
-        except (FileNotFoundError, Exception):
-            hud_console.error(
-                "No source provided and no task/eval JSON files found in current directory"
-            )
-            hud_console.info(
-                "Usage: hud eval <source> or create a task JSON file (e.g., task.json, tasks.jsonl)"
-            )
-            raise typer.Exit(1) from None
-
-    # Import eval_command lazily to avoid importing agent dependencies
-    try:
-        from .eval import eval_command, get_available_models
-    except ImportError as e:
-        hud_console.error(
-            "Evaluation dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
-
-    # If no agent specified, fetch available models and prompt for selection
-    base_model = None
-    hud_model_base_map: dict[str, str] = {}
-    if agent is None:
-        # Get available HUD models first
-        hud_models = get_available_models()
-
-        # Build choices starting with HUD models
-        choices = []
-
-        # Add HUD models as agent choices
-        for hud_model in hud_models:
-            model_name = hud_model["name"]
-            hud_model_base_map[model_name] = hud_model["base_model"]
-            vllm_status = " ⚡" if hud_model.get("vllm_url") else ""
-            choices.append({"name": f"{model_name}{vllm_status}", "value": f"{model_name}"})
-
-        # Add standard agent choices
-        choices.extend(
-            [
-                {"name": "Claude 4 Sonnet", "value": AgentType.CLAUDE},
-                {"name": "OpenAI Computer Use", "value": AgentType.OPENAI},
-                {"name": "Gemini Computer Use", "value": AgentType.GEMINI},
-                {"name": "vLLM (Local Server)", "value": AgentType.VLLM},
-                {"name": "LiteLLM (Multi-provider)", "value": AgentType.LITELLM},
-            ]
-        )
-
-        agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
-
-    # Handle HUD model selection
-    if agent and agent not in [e.value for e in AgentType]:
-        # Find remote model name
-        model = agent
-        if not vllm_base_url:
-            vllm_base_url = f"{settings.hud_rl_url}/models/{model}/vllm"
-
-        # Set model to base model for the vllm endpoint
-        if not base_model:
-            if not hud_model_base_map:
-                hud_models = get_available_models()
-                for hud_model in hud_models:
-                    hud_model_base_map[hud_model["name"]] = hud_model["base_model"]
-            base_model = hud_model_base_map.get(model)
-        if not base_model:
-            hud_console.error(f"Model {model} not found")
-            raise typer.Exit(1)
-        model = base_model
-        agent = AgentType.VLLM  # Use vLLM backend for HUD models
-        hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
-
-    # Validate agent choice
-    valid_agents = [e.value for e in AgentType]
-    if agent not in valid_agents:
-        hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
-        raise typer.Exit(1)
-
-    # Type narrowing: agent is now guaranteed to be an AgentType value after validation
-    agent = AgentType(agent)
-
-    # Run the command
-    eval_command(
-        source=source,
-        full=full,
-        agent=agent,
-        model=model,
-        allowed_tools=allowed_tools,
-        max_concurrent=max_concurrent,
-        max_steps=max_steps,
-        verbose=verbose,
-        very_verbose=very_verbose,
-        vllm_base_url=vllm_base_url,
-        group_size=group_size,
-        integration_test=integration_test,
-    )
-
+app.command(name="eval")(eval_command)
 
 
 @app.command()
diff --git a/hud/cli/validate.py b/hud/cli/validate.py
index baad9dad5..9ceeaa345 100644
--- a/hud/cli/validate.py
+++ b/hud/cli/validate.py
@@ -4,7 +4,7 @@
 
 import json
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 
 import typer
 from pydantic import ValidationError
@@ -65,7 +65,7 @@ def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
     path = Path(source)
     if path.exists() and path.suffix.lower() in {".json", ".jsonl"}:
         return _load_raw_from_file(path)
-    return load_tasks(source, raw=True), []
+    return cast(list[dict[str, Any]], load_tasks(source, raw=True)), []
 
 
 def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:

From 0191b4d1a8246f38c8b7b74ac05cc765a2c22f1d Mon Sep 17 00:00:00 2001
From: MagellaX <alphacr792@gmail.com>
Date: Mon, 26 Jan 2026 18:24:22 +0530
Subject: [PATCH 5/5] Fix ruff formatting and JSONL validation

---
 hud/cli/__init__.py            |  7 ++-----
 hud/cli/validate.py            | 11 ++++++++---
 hud/tests/test_validate_cli.py |  6 +++++-
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index f32cedc3e..644baa647 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -26,15 +26,12 @@
 from .pull import pull_command
 from .push import push_command
 from .remove import remove_command
-
-from .validate import validate_command
-
 from .rft import rft_command
 from .rft_status import rft_status_command
-
 from .utils.config import set_env_values
 from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .utils.logging import CaptureLogger
+from .validate import validate_command
 
 # Create the main Typer app
 app = typer.Typer(
@@ -984,7 +981,7 @@ def quickstart() -> None:
 
 @app.command()
 def validate(
-    source: str = typer.Argument(  # type: ignore[arg-type]  # noqa: B008
+    source: str = typer.Argument(  # type: ignore[arg-type]
         ...,
         help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)",
     ),
diff --git a/hud/cli/validate.py b/hud/cli/validate.py
index 9ceeaa345..1c8f1d4b3 100644
--- a/hud/cli/validate.py
+++ b/hud/cli/validate.py
@@ -65,7 +65,7 @@ def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
     path = Path(source)
     if path.exists() and path.suffix.lower() in {".json", ".jsonl"}:
         return _load_raw_from_file(path)
-    return cast(list[dict[str, Any]], load_tasks(source, raw=True)), []
+    return cast("list[dict[str, Any]]", load_tasks(source, raw=True)), []
 
 
 def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
@@ -78,7 +78,11 @@ def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
                 line = line.strip()
                 if not line:
                     continue
-                value = json.loads(line)
+                try:
+                    value = json.loads(line)
+                except json.JSONDecodeError as e:
+                    errors.append(f"line {line_no}: invalid JSON ({e.msg})")
+                    continue
                 if isinstance(value, dict):
                     items.append(value)
                     continue
@@ -87,8 +91,9 @@ def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
                         if isinstance(entry, dict):
                             items.append(entry)
                         else:
+                            entry_type = type(entry).__name__
                             errors.append(
-                                f"line {line_no} item {idx}: expected object, got {type(entry).__name__}"
+                                f"line {line_no} item {idx}: expected object, got {entry_type}"
                             )
                     continue
                 errors.append(
diff --git a/hud/tests/test_validate_cli.py b/hud/tests/test_validate_cli.py
index 52d10d335..0adddf774 100644
--- a/hud/tests/test_validate_cli.py
+++ b/hud/tests/test_validate_cli.py
@@ -46,7 +46,11 @@ def test_validate_command_invalid(tmp_path: Path) -> None:
 def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None:
     validate_command = _load_validate_command()
     tasks = [
-        {"prompt": "ok", "mcp_config": {"local": {"command": "echo", "args": ["hi"]}}},
+        {
+            "prompt": "ok",
+            "mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
+            "evaluate_tool": {"name": "done", "arguments": {}},
+        },
         "not a task",
     ]
     path = _write_tasks(tmp_path / "tasks.json", tasks)