From 5f13d244d6c6d92dae1014103b6e7bd4856f2e95 Mon Sep 17 00:00:00 2001 From: Farrel Mahaztra <15523645+farrelmahaztra@users.noreply.github.com> Date: Wed, 25 Feb 2026 01:40:24 +0700 Subject: [PATCH 1/3] Emit scenario spans --- hud/eval/context.py | 100 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 5 deletions(-) diff --git a/hud/eval/context.py b/hud/eval/context.py index 16cbea38..f5b55ca5 100644 --- a/hud/eval/context.py +++ b/hud/eval/context.py @@ -18,7 +18,8 @@ from hud.environment import Environment from hud.settings import settings from hud.shared import make_request -from hud.telemetry import flush, instrument +from hud.telemetry import flush, instrument, queue_span +from hud.telemetry.instrument import _normalize_trace_id, _now_iso if TYPE_CHECKING: from collections.abc import Generator @@ -388,29 +389,118 @@ def from_task( return ctx + def _emit_scenario_span( + self, + name: str, + status: str, + scenario_name: str, + start_time: str, + end_time: str | None = None, + result: Any = None, + error: str | None = None, + ) -> None: + """Emit a scenario lifecycle span for real-time stage visibility.""" + span = { + "name": name, + "trace_id": _normalize_trace_id(self.trace_id), + "span_id": uuid.uuid4().hex[:16], + "parent_span_id": None, + "start_time": start_time, + "end_time": end_time or start_time, + "status_code": "ERROR" if error else "OK", + "status_message": error, + "attributes": { + "task_run_id": self.trace_id, + "category": "scenario", + "type": "CLIENT", + "scenario_name": scenario_name, + "status": status, + "result": result, + }, + "internal_type": f"scenario-{name.split('_')[-1]}", + } + queue_span(span) + async def _run_task_scenario_setup(self) -> None: """Run the task's scenario setup phase (if scenario provided).""" if self._task is None or self._task.scenario is None: return - prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {}) - if prompt: - self.prompt = prompt + scenario_name = self._task.scenario + start_time = _now_iso() + + self._emit_scenario_span( + "scenario_setup", + "started", + scenario_name, + start_time, + ) + + try: + prompt = await self.run_scenario_setup(scenario_name, self._task.args or {}) + if prompt: + self.prompt = prompt + + self._emit_scenario_span( + "scenario_setup", + "completed", + scenario_name, + start_time, + _now_iso(), + ) + except Exception as e: + self._emit_scenario_span( + "scenario_setup", + "error", + scenario_name, + start_time, + _now_iso(), + error=str(e), + ) + raise async def _run_task_scenario_evaluate(self) -> None: """Run the task's scenario evaluate phase (if scenario provided).""" if self._task is None or self._task.scenario is None: return + scenario_name = self._task.scenario + start_time = _now_iso() + + self._emit_scenario_span( + "scenario_evaluate", + "started", + scenario_name, + start_time, + ) + try: - result = await self.run_scenario_evaluate(self._task.scenario) + result = await self.run_scenario_evaluate(scenario_name) except Exception as e: self.error = e + self._emit_scenario_span( + "scenario_evaluate", + "error", + scenario_name, + start_time, + _now_iso(), + error=str(e), + ) return self.evaluation_result = result self.reward = result.reward + # Emit "completed" span with reward + self._emit_scenario_span( + "scenario_evaluate", + "completed", + scenario_name, + start_time, + _now_iso(), + result={"reward": result.reward} if result else None, + ) + # ========================================================================= # Summary Context - Attribute Access Control # ========================================================================= From 281b78ceb3b1f6b6cc9e8addb2faddffcf75b715 Mon Sep 17 00:00:00 2001 From: Farrel Mahaztra <15523645+farrelmahaztra@users.noreply.github.com> Date: Wed, 25 Feb 2026 01:59:36 +0700 Subject: [PATCH 2/3] Fix exception handling --- hud/eval/context.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/hud/eval/context.py b/hud/eval/context.py index f5b55ca5..09725ce8 100644 --- a/hud/eval/context.py +++ b/hud/eval/context.py @@ -476,6 +476,17 @@ async def _run_task_scenario_evaluate(self) -> None: try: result = await self.run_scenario_evaluate(scenario_name) + self.evaluation_result = result + self.reward = result.reward + + self._emit_scenario_span( + "scenario_evaluate", + "completed", + scenario_name, + start_time, + _now_iso(), + result={"reward": result.reward}, + ) except Exception as e: self.error = e self._emit_scenario_span( @@ -486,20 +497,6 @@ async def _run_task_scenario_evaluate(self) -> None: _now_iso(), error=str(e), ) - return - - self.evaluation_result = result - self.reward = result.reward - - # Emit "completed" span with reward - self._emit_scenario_span( - "scenario_evaluate", - "completed", - scenario_name, - start_time, - _now_iso(), - result={"reward": result.reward} if result else None, - ) # ========================================================================= # Summary Context - Attribute Access Control From 0983cd58bc1008f5519d5d94fbbaa89af79fe89d Mon Sep 17 00:00:00 2001 From: Farrel Mahaztra <15523645+farrelmahaztra@users.noreply.github.com> Date: Wed, 25 Feb 2026 02:07:55 +0700 Subject: [PATCH 3/3] Skip if trace disabled --- hud/eval/context.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hud/eval/context.py b/hud/eval/context.py index 09725ce8..3f05bd8f 100644 --- a/hud/eval/context.py +++ b/hud/eval/context.py @@ -400,6 +400,9 @@ def _emit_scenario_span( error: str | None = None, ) -> None: """Emit a scenario lifecycle span for real-time stage visibility.""" + if not self._trace_enabled: + return + span = { "name": name, "trace_id": _normalize_trace_id(self.trace_id),