Merged
69 changes: 55 additions & 14 deletions eval.schema.json
@@ -232,6 +232,7 @@
}
},
"score_details": {
"type" : "object",
"description": "The score for the evaluation and related details",
"required": [
"score"
@@ -244,21 +245,61 @@
"details": {
"$ref": "#/$defs/additional_properties_object"
},
"confidence_interval": {
"uncertainty": {
"type": "object",
"description": "Confidence interval for the score",
"description": "Quantification of uncertainty around the reported score",
"properties": {
"lower": {
"type": "number",
"description": "Lower bound of the confidence interval"
"standard_error": {
"type": "object",
"description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
"properties": {
"value": {
"type": "number",
"description": "The standard error value"
},
"method": {
"type": "string",
"description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
}
},
"required": ["value"]
},
"confidence_interval": {
"type": "object",
"description": "Lower and upper bounds for the metric at a given confidence level.",
"properties": {
"lower": {
"type": "number",
"description": "Lower bound of the confidence interval"
},
"upper": {
"type": "number",
"description": "Upper bound of the confidence interval"
},
"confidence_level": {
"type": "number",
"description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
"minimum": 0,
"maximum": 1
},
"method": {
"type": "string",
"description": "How the confidence interval was computed"
}
},
"required": ["lower", "upper", "confidence_level"]
},
"upper": {
"standard_deviation": {
"type": "number",
"description": "Upper bound of the confidence interval"
"description": "Standard deviation of the per-sample scores"
},
"method": {
"type": "string",
"description": "How it was computed (e.g. bootstrap, holdout)"
"num_samples": {
"type": "integer",
"description": "Number of samples used to compute the uncertainty estimates"
},
"num_bootstrap_samples": {
"type": "integer",
"description": "Number of bootstrap resamples used, if bootstrap method was applied"
}
}
}
@@ -367,15 +408,15 @@
"description": "Listed evaluation limits like time limit, message limit, token limit.",
"properties": {
"time_limit": {
"type": "int",
"type": "integer",
"description": "Time limit for evaluation."
},
"message_limit": {
"type": "int",
"type": "integer",
"description": "Message limit for evaluation."
},
"token_limit": {
"type": "int",
"type": "integer",
"description": "Token limit for evaluation."
}
}
@@ -609,4 +650,4 @@
}
}
}
}
}
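For illustration, a score_details block conforming to the revised schema could report uncertainty along these lines. The sketch below is a Python dict mirroring the JSON instance; all values are hypothetical and not taken from this PR. The standard error is the analytic SE_mean = standard_deviation / sqrt(num_samples), i.e. 0.21 / sqrt(400) = 0.0105, and the interval is a 95% bootstrap interval around the score.

# Hypothetical score_details payload under the revised schema (illustrative values only).
score_details = {
    "score": 0.73,
    "uncertainty": {
        "standard_error": {"value": 0.0105, "method": "analytic"},  # 0.21 / sqrt(400)
        "confidence_interval": {
            "lower": 0.709,
            "upper": 0.751,
            "confidence_level": 0.95,
            "method": "bootstrap",
        },
        "standard_deviation": 0.21,
        "num_samples": 400,
        "num_bootstrap_samples": 1000,
    },
}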
54 changes: 41 additions & 13 deletions eval_types.py
@@ -1,13 +1,13 @@
# generated by datamodel-codegen:
# filename: eval.schema.json
# timestamp: 2026-02-09T17:16:13+00:00
# timestamp: 2026-02-10T16:07:04+00:00

from __future__ import annotations

from enum import Enum
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field, conint
from pydantic import BaseModel, ConfigDict, Field, confloat, conint


class SourceType(Enum):
Expand Down Expand Up @@ -58,22 +58,50 @@ class AggregationMethod(Enum):
median = 'median'


class ConfidenceInterval(BaseModel):
lower: float | None = Field(
None, description='Lower bound of the confidence interval'
class StandardError(BaseModel):
value: float = Field(..., description='The standard error value')
method: str | None = Field(
None,
description="How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')",
)
upper: float | None = Field(
None, description='Upper bound of the confidence interval'


class ConfidenceInterval(BaseModel):
lower: float = Field(..., description='Lower bound of the confidence interval')
upper: float = Field(..., description='Upper bound of the confidence interval')
confidence_level: confloat(ge=0.0, le=1.0) = Field(
..., description='Confidence level (e.g. 0.95 for a 95% confidence interval)'
)
method: str | None = Field(
None, description='How it was computed (e.g. bootstrap, holdout)'
None, description='How the confidence interval was computed'
)


class Uncertainty(BaseModel):
standard_error: StandardError | None = Field(
None,
description='Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))',
)
confidence_interval: ConfidenceInterval | None = Field(
None,
description='Lower and upper bounds for the metric at a given confidence level.',
)
standard_deviation: float | None = Field(
None, description='Standard deviation of the per-sample scores'
)
num_samples: int | None = Field(
None, description='Number of samples used to compute the uncertainty estimates'
)
num_bootstrap_samples: int | None = Field(
None,
description='Number of bootstrap resamples used, if bootstrap method was applied',
)


class EvalLimits(BaseModel):
time_limit: Any | None = Field(None, description='Time limit for evaluation.')
message_limit: Any | None = Field(None, description='Message limit for evaluation.')
token_limit: Any | None = Field(None, description='Token limit for evaluation.')
time_limit: int | None = Field(None, description='Time limit for evaluation.')
message_limit: int | None = Field(None, description='Message limit for evaluation.')
token_limit: int | None = Field(None, description='Token limit for evaluation.')


class Sandbox(BaseModel):
@@ -180,8 +208,8 @@ class SourceDataPrivate(BaseModel):
class ScoreDetails(BaseModel):
score: float = Field(..., description='The score for the evaluation')
details: AdditionalPropertiesObject | None = None
confidence_interval: ConfidenceInterval | None = Field(
None, description='Confidence interval for the score'
uncertainty: Uncertainty | None = Field(
None, description='Quantification of uncertainty around the reported score'
)


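A minimal usage sketch of the regenerated models, assuming they are importable from eval_types as shown in the diff above; the statistics are hypothetical and the 1.96 multiplier is the usual normal-approximation factor for a 95% interval.

from math import sqrt

from eval_types import ConfidenceInterval, StandardError, Uncertainty

# Hypothetical per-sample statistics for a run of 400 scored samples.
score, std_dev, n = 0.73, 0.21, 400
se = std_dev / sqrt(n)  # analytic SE_mean = standard_deviation / sqrt(num_samples)

uncertainty = Uncertainty(
    standard_error=StandardError(value=se, method="analytic"),
    confidence_interval=ConfidenceInterval(
        lower=score - 1.96 * se,  # normal-approximation 95% interval
        upper=score + 1.96 * se,
        confidence_level=0.95,
        method="analytic",
    ),
    standard_deviation=std_dev,
    num_samples=n,
)
print(uncertainty.model_dump_json(exclude_none=True))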
2 changes: 1 addition & 1 deletion instance_level_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: instance_level_eval.schema.json
# timestamp: 2026-02-09T17:11:20+00:00
# timestamp: 2026-02-10T16:07:05+00:00

from __future__ import annotations
