From 8920f2b4db94e7ad862978c2034d256fc09db4ba Mon Sep 17 00:00:00 2001
From: SreeHarshaNelaturu
Date: Tue, 10 Feb 2026 17:07:35 +0100
Subject: [PATCH] schema fix for uncertainty

---
 eval.schema.json        | 69 ++++++++++++++++++++++++++++++++---------
 eval_types.py           | 54 ++++++++++++++++++++++++--------
 instance_level_types.py |  2 +-
 3 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/eval.schema.json b/eval.schema.json
index 3abc19abe..c9917b4ad 100644
--- a/eval.schema.json
+++ b/eval.schema.json
@@ -232,6 +232,7 @@
       }
     },
     "score_details": {
+      "type": "object",
       "description": "The score for the evaluation and related details",
       "required": [
         "score"
@@ -244,21 +245,61 @@
         "details": {
           "$ref": "#/$defs/additional_properties_object"
         },
-        "confidence_interval": {
+        "uncertainty": {
           "type": "object",
-          "description": "Confidence interval for the score",
+          "description": "Quantification of uncertainty around the reported score",
           "properties": {
-            "lower": {
-              "type": "number",
-              "description": "Lower bound of the confidence interval"
+            "standard_error": {
+              "type": "object",
+              "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
+              "properties": {
+                "value": {
+                  "type": "number",
+                  "description": "The standard error value"
+                },
+                "method": {
+                  "type": "string",
+                  "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
+                }
+              },
+              "required": ["value"]
+            },
+            "confidence_interval": {
+              "type": "object",
+              "description": "Lower and upper bounds for the metric at a given confidence level.",
+              "properties": {
+                "lower": {
+                  "type": "number",
+                  "description": "Lower bound of the confidence interval"
+                },
+                "upper": {
+                  "type": "number",
+                  "description": "Upper bound of the confidence interval"
+                },
+                "confidence_level": {
+                  "type": "number",
+                  "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
+                  "minimum": 0,
+                  "maximum": 1
+                },
+                "method": {
+                  "type": "string",
+                  "description": "How the confidence interval was computed"
+                }
+              },
+              "required": ["lower", "upper", "confidence_level"]
             },
-            "upper": {
+            "standard_deviation": {
               "type": "number",
-              "description": "Upper bound of the confidence interval"
+              "description": "Standard deviation of the per-sample scores"
             },
-            "method": {
-              "type": "string",
-              "description": "How it was computed (e.g. bootstrap, holdout)"
+            "num_samples": {
+              "type": "integer",
+              "description": "Number of samples used to compute the uncertainty estimates"
+            },
+            "num_bootstrap_samples": {
+              "type": "integer",
+              "description": "Number of bootstrap resamples used, if bootstrap method was applied"
             }
           }
         }
@@ -367,15 +408,15 @@
       "description": "Listed evaluation limits like time limit, message limit, token limit.",
       "properties": {
         "time_limit": {
-          "type": "int",
+          "type": "integer",
           "description": "Time limit for evaluation."
         },
         "message_limit": {
-          "type": "int",
+          "type": "integer",
           "description": "Message limit for evaluation."
         },
         "token_limit": {
-          "type": "int",
+          "type": "integer",
           "description": "Token limit for evaluation."
         }
       }
@@ -609,4 +650,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/eval_types.py b/eval_types.py
index e58579de3..e4d8a9e44 100644
--- a/eval_types.py
+++ b/eval_types.py
@@ -1,13 +1,13 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2026-02-09T17:16:13+00:00
+#   timestamp: 2026-02-10T16:07:04+00:00
 
 from __future__ import annotations
 
 from enum import Enum
 from typing import Any, Literal
 
-from pydantic import BaseModel, ConfigDict, Field, conint
+from pydantic import BaseModel, ConfigDict, Field, confloat, conint
 
 
 class SourceType(Enum):
@@ -58,22 +58,50 @@ class AggregationMethod(Enum):
     median = 'median'
 
 
-class ConfidenceInterval(BaseModel):
-    lower: float | None = Field(
-        None, description='Lower bound of the confidence interval'
+class StandardError(BaseModel):
+    value: float = Field(..., description='The standard error value')
+    method: str | None = Field(
+        None,
+        description="How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')",
     )
-    upper: float | None = Field(
-        None, description='Upper bound of the confidence interval'
+
+
+class ConfidenceInterval(BaseModel):
+    lower: float = Field(..., description='Lower bound of the confidence interval')
+    upper: float = Field(..., description='Upper bound of the confidence interval')
+    confidence_level: confloat(ge=0.0, le=1.0) = Field(
+        ..., description='Confidence level (e.g. 0.95 for a 95% confidence interval)'
     )
     method: str | None = Field(
-        None, description='How it was computed (e.g. bootstrap, holdout)'
+        None, description='How the confidence interval was computed'
+    )
+
+
+class Uncertainty(BaseModel):
+    standard_error: StandardError | None = Field(
+        None,
+        description='Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))',
+    )
+    confidence_interval: ConfidenceInterval | None = Field(
+        None,
+        description='Lower and upper bounds for the metric at a given confidence level.',
+    )
+    standard_deviation: float | None = Field(
+        None, description='Standard deviation of the per-sample scores'
+    )
+    num_samples: int | None = Field(
+        None, description='Number of samples used to compute the uncertainty estimates'
+    )
+    num_bootstrap_samples: int | None = Field(
+        None,
+        description='Number of bootstrap resamples used, if bootstrap method was applied',
     )
 
 
 class EvalLimits(BaseModel):
-    time_limit: Any | None = Field(None, description='Time limit for evaluation.')
-    message_limit: Any | None = Field(None, description='Message limit for evaluation.')
-    token_limit: Any | None = Field(None, description='Token limit for evaluation.')
+    time_limit: int | None = Field(None, description='Time limit for evaluation.')
+    message_limit: int | None = Field(None, description='Message limit for evaluation.')
+    token_limit: int | None = Field(None, description='Token limit for evaluation.')
 
 
 class Sandbox(BaseModel):
@@ -180,8 +208,8 @@ class SourceDataPrivate(BaseModel):
 class ScoreDetails(BaseModel):
     score: float = Field(..., description='The score for the evaluation')
     details: AdditionalPropertiesObject | None = None
-    confidence_interval: ConfidenceInterval | None = Field(
-        None, description='Confidence interval for the score'
+    uncertainty: Uncertainty | None = Field(
+        None, description='Quantification of uncertainty around the reported score'
     )
 
 
diff --git a/instance_level_types.py b/instance_level_types.py
index 36cacc578..792dd922c 100644
--- a/instance_level_types.py
+++ b/instance_level_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  instance_level_eval.schema.json
-#   timestamp: 2026-02-09T17:11:20+00:00
+#   timestamp: 2026-02-10T16:07:05+00:00
 
 from __future__ import annotations
 
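
Usage note (illustrative, not part of the patch): one way the new models from
eval_types.py could be populated from per-sample scores. The summarize_scores
helper below is a hypothetical sketch, not project API; the analytic standard
error and the percentile-bootstrap interval are standard choices, shown only
to make the field semantics concrete (n >= 2 and a reasonably large n_boot
are assumed).

    import math
    import random

    from eval_types import (
        ConfidenceInterval,
        ScoreDetails,
        StandardError,
        Uncertainty,
    )


    def summarize_scores(
        scores: list[float], n_boot: int = 1000, level: float = 0.95
    ) -> ScoreDetails:
        """Hypothetical helper: report a mean score with the uncertainty block."""
        n = len(scores)
        mean = sum(scores) / n
        # Sample standard deviation of the per-sample scores.
        sd = math.sqrt(sum((s - mean) ** 2 for s in scores) / (n - 1))
        # Percentile bootstrap: resample the scores with replacement n_boot
        # times and take empirical quantiles of the resampled means.
        boot_means = sorted(
            sum(random.choices(scores, k=n)) / n for _ in range(n_boot)
        )
        alpha = 1 - level
        lower = boot_means[int(alpha / 2 * n_boot)]
        upper = boot_means[int((1 - alpha / 2) * n_boot) - 1]
        return ScoreDetails(
            score=mean,
            uncertainty=Uncertainty(
                # SE_mean = standard_deviation / sqrt(num_samples), per the schema.
                standard_error=StandardError(value=sd / math.sqrt(n), method="analytic"),
                confidence_interval=ConfidenceInterval(
                    lower=lower, upper=upper, confidence_level=level, method="bootstrap"
                ),
                standard_deviation=sd,
                num_samples=n,
                num_bootstrap_samples=n_boot,
            ),
        )

Serializing the result with ScoreDetails.model_dump_json() yields a document
that validates against the updated score_details definition in eval.schema.json.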