From 8920f2b4db94e7ad862978c2034d256fc09db4ba Mon Sep 17 00:00:00 2001
From: SreeHarshaNelaturu
Date: Tue, 10 Feb 2026 17:07:35 +0100
Subject: [PATCH] schema fix for uncertainty

---
 eval.schema.json        | 69 ++++++++++++++++++++++++++++++++---------
 eval_types.py           | 54 ++++++++++++++++++++++++--------
 instance_level_types.py |  2 +-
 3 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/eval.schema.json b/eval.schema.json
index 3abc19abe..c9917b4ad 100644
--- a/eval.schema.json
+++ b/eval.schema.json
@@ -232,6 +232,7 @@
       }
     },
     "score_details": {
+      "type": "object",
       "description": "The score for the evaluation and related details",
       "required": [
         "score"
@@ -244,21 +245,61 @@
         "details": {
           "$ref": "#/$defs/additional_properties_object"
         },
-        "confidence_interval": {
+        "uncertainty": {
           "type": "object",
-          "description": "Confidence interval for the score",
+          "description": "Quantification of uncertainty around the reported score",
           "properties": {
-            "lower": {
-              "type": "number",
-              "description": "Lower bound of the confidence interval"
+            "standard_error": {
+              "type": "object",
+              "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
+              "properties": {
+                "value": {
+                  "type": "number",
+                  "description": "The standard error value"
+                },
+                "method": {
+                  "type": "string",
+                  "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
+                }
+              },
+              "required": ["value"]
+            },
+            "confidence_interval": {
+              "type": "object",
+              "description": "Lower and upper bounds for the metric at a given confidence level.",
+              "properties": {
+                "lower": {
+                  "type": "number",
+                  "description": "Lower bound of the confidence interval"
+                },
+                "upper": {
+                  "type": "number",
+                  "description": "Upper bound of the confidence interval"
+                },
+                "confidence_level": {
+                  "type": "number",
+                  "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
+                  "minimum": 0,
+                  "maximum": 1
+                },
+                "method": {
+                  "type": "string",
+                  "description": "How the confidence interval was computed"
+                }
+              },
+              "required": ["lower", "upper", "confidence_level"]
             },
-            "upper": {
+            "standard_deviation": {
               "type": "number",
-              "description": "Upper bound of the confidence interval"
+              "description": "Standard deviation of the per-sample scores"
             },
-            "method": {
-              "type": "string",
-              "description": "How it was computed (e.g. bootstrap, holdout)"
+            "num_samples": {
+              "type": "integer",
+              "description": "Number of samples used to compute the uncertainty estimates"
+            },
+            "num_bootstrap_samples": {
+              "type": "integer",
+              "description": "Number of bootstrap resamples used, if bootstrap method was applied"
             }
           }
         }
@@ -367,15 +408,15 @@
       "description": "Listed evaluation limits like time limit, message limit, token limit.",
       "properties": {
         "time_limit": {
-          "type": "int",
+          "type": "integer",
           "description": "Time limit for evaluation."
         },
         "message_limit": {
-          "type": "int",
+          "type": "integer",
           "description": "Message limit for evaluation."
         },
         "token_limit": {
-          "type": "int",
+          "type": "integer",
           "description": "Token limit for evaluation."
         }
       }
@@ -609,4 +650,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/eval_types.py b/eval_types.py
index e58579de3..e4d8a9e44 100644
--- a/eval_types.py
+++ b/eval_types.py
@@ -1,13 +1,13 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2026-02-09T17:16:13+00:00
+#   timestamp: 2026-02-10T16:07:04+00:00
 
 from __future__ import annotations
 
 from enum import Enum
 from typing import Any, Literal
 
-from pydantic import BaseModel, ConfigDict, Field, conint
+from pydantic import BaseModel, ConfigDict, Field, confloat, conint
 
 
 class SourceType(Enum):
@@ -58,22 +58,50 @@ class AggregationMethod(Enum):
     median = 'median'
 
 
-class ConfidenceInterval(BaseModel):
-    lower: float | None = Field(
-        None, description='Lower bound of the confidence interval'
+class StandardError(BaseModel):
+    value: float = Field(..., description='The standard error value')
+    method: str | None = Field(
+        None,
+        description="How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')",
     )
-    upper: float | None = Field(
-        None, description='Upper bound of the confidence interval'
+
+
+class ConfidenceInterval(BaseModel):
+    lower: float = Field(..., description='Lower bound of the confidence interval')
+    upper: float = Field(..., description='Upper bound of the confidence interval')
+    confidence_level: confloat(ge=0.0, le=1.0) = Field(
+        ..., description='Confidence level (e.g. 0.95 for a 95% confidence interval)'
     )
     method: str | None = Field(
-        None, description='How it was computed (e.g. bootstrap, holdout)'
+        None, description='How the confidence interval was computed'
+    )
+
+
+class Uncertainty(BaseModel):
+    standard_error: StandardError | None = Field(
+        None,
+        description='Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))',
+    )
+    confidence_interval: ConfidenceInterval | None = Field(
+        None,
+        description='Lower and upper bounds for the metric at a given confidence level.',
+    )
+    standard_deviation: float | None = Field(
+        None, description='Standard deviation of the per-sample scores'
+    )
+    num_samples: int | None = Field(
+        None, description='Number of samples used to compute the uncertainty estimates'
+    )
+    num_bootstrap_samples: int | None = Field(
+        None,
+        description='Number of bootstrap resamples used, if bootstrap method was applied',
     )
 
 
 class EvalLimits(BaseModel):
-    time_limit: Any | None = Field(None, description='Time limit for evaluation.')
-    message_limit: Any | None = Field(None, description='Message limit for evaluation.')
-    token_limit: Any | None = Field(None, description='Token limit for evaluation.')
+    time_limit: int | None = Field(None, description='Time limit for evaluation.')
+    message_limit: int | None = Field(None, description='Message limit for evaluation.')
+    token_limit: int | None = Field(None, description='Token limit for evaluation.')
 
 
 class Sandbox(BaseModel):
@@ -180,8 +208,8 @@ class SourceDataPrivate(BaseModel):
 class ScoreDetails(BaseModel):
     score: float = Field(..., description='The score for the evaluation')
     details: AdditionalPropertiesObject | None = None
-    confidence_interval: ConfidenceInterval | None = Field(
-        None, description='Confidence interval for the score'
+    uncertainty: Uncertainty | None = Field(
+        None, description='Quantification of uncertainty around the reported score'
     )
 
 
diff --git a/instance_level_types.py b/instance_level_types.py
index 36cacc578..792dd922c 100644
--- a/instance_level_types.py
+++ b/instance_level_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  instance_level_eval.schema.json
-#   timestamp: 2026-02-09T17:11:20+00:00
+#   timestamp: 2026-02-10T16:07:05+00:00
 
 from __future__ import annotations
 
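
Usage note (illustrative, not part of the patch): one way the new models from
eval_types.py could be populated from per-sample scores. The summarize_scores
helper below is a hypothetical sketch, not project API; the analytic standard
error and the percentile-bootstrap interval are standard choices, shown only
to make the field semantics concrete (n >= 2 and a reasonably large n_boot
are assumed).

    import math
    import random

    from eval_types import (
        ConfidenceInterval,
        ScoreDetails,
        StandardError,
        Uncertainty,
    )


    def summarize_scores(
        scores: list[float], n_boot: int = 1000, level: float = 0.95
    ) -> ScoreDetails:
        """Hypothetical helper: report a mean score with the uncertainty block."""
        n = len(scores)
        mean = sum(scores) / n
        # Sample standard deviation of the per-sample scores.
        sd = math.sqrt(sum((s - mean) ** 2 for s in scores) / (n - 1))
        # Percentile bootstrap: resample the scores with replacement n_boot
        # times and take empirical quantiles of the resampled means.
        boot_means = sorted(
            sum(random.choices(scores, k=n)) / n for _ in range(n_boot)
        )
        alpha = 1 - level
        lower = boot_means[int(alpha / 2 * n_boot)]
        upper = boot_means[int((1 - alpha / 2) * n_boot) - 1]
        return ScoreDetails(
            score=mean,
            uncertainty=Uncertainty(
                # SE_mean = standard_deviation / sqrt(num_samples), per the schema.
                standard_error=StandardError(value=sd / math.sqrt(n), method="analytic"),
                confidence_interval=ConfidenceInterval(
                    lower=lower, upper=upper, confidence_level=level, method="bootstrap"
                ),
                standard_deviation=sd,
                num_samples=n,
                num_bootstrap_samples=n_boot,
            ),
        )

Serializing the result with ScoreDetails.model_dump_json() yields a document
that validates against the updated score_details definition in eval.schema.json.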