Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
e0c56fb
When unset, take it as success
kerthcet Feb 18, 2026
f6c9203
update the workflow icon
kerthcet Feb 18, 2026
2d2ee2f
Use graphql for biz logic
kerthcet Feb 19, 2026
f1efb0f
add tests
kerthcet Feb 19, 2026
095edb7
update tests
kerthcet Feb 19, 2026
fbb9782
update the layout of the sidebar
kerthcet Feb 19, 2026
7a3e24b
update the layout
kerthcet Feb 19, 2026
417194f
update test
kerthcet Feb 19, 2026
a506c68
fix test
kerthcet Feb 19, 2026
6b7267c
fix test
kerthcet Feb 19, 2026
8031463
fix test
kerthcet Feb 19, 2026
35e0916
fix test
kerthcet Feb 19, 2026
039e5c3
fix test
kerthcet Feb 19, 2026
feb9315
fix lint
kerthcet Feb 19, 2026
a9d205a
fix spans
kerthcet Feb 19, 2026
632a1b5
fix lint
kerthcet Feb 19, 2026
eb8a66f
fix query error
kerthcet Feb 19, 2026
d013cba
fix test
kerthcet Feb 19, 2026
fd77aeb
fix test
kerthcet Feb 19, 2026
cd482c9
fix test
kerthcet Feb 19, 2026
b006008
fix test
kerthcet Feb 19, 2026
4735ac2
fix test
kerthcet Feb 19, 2026
d4fef87
fix test
kerthcet Feb 19, 2026
60e3835
fix test
kerthcet Feb 19, 2026
95daec7
fix test
kerthcet Feb 19, 2026
1824f29
fix test
kerthcet Feb 19, 2026
d5eb521
use lock to fix concurrent problem
kerthcet Feb 19, 2026
4954729
debug
kerthcet Feb 19, 2026
8d2776c
disable batch in integration tests
kerthcet Feb 19, 2026
880f37b
use env for batch
kerthcet Feb 19, 2026
391703b
debug
kerthcet Feb 19, 2026
2f77c6d
debug
kerthcet Feb 19, 2026
145e1b1
remove execution result from metadata
kerthcet Feb 19, 2026
6d00a76
fix
kerthcet Feb 19, 2026
2882716
fix sidebar
kerthcet Feb 19, 2026
b4c5ab8
optimize the layout of traces
kerthcet Feb 19, 2026
586d063
optimize the layout of traces
kerthcet Feb 20, 2026
65505b9
optimize the layout
kerthcet Feb 20, 2026
f056b26
optimize the layout
kerthcet Feb 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ ALPHATRION_ARTIFACT_INSECURE=false
# Tracing configurations
ALPHATRION_ENABLE_TRACING=true
ALPHATRION_CLICKHOUSE_INIT_TABLES=true
ALPHATRION_CLICKHOUSE_ENABLE_BATCH=true
3 changes: 2 additions & 1 deletion .env.integration-test
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ ALPHATRION_ARTIFACT_INSECURE=true
ALPHATRION_LOG_LEVEL=INFO
ALPHATRION_AUTO_CLEANUP=true
ALPHATRION_ENABLE_TRACING=true
ALPHATRION_CLICKHOUSE_INIT_TABLES=true
ALPHATRION_CLICKHOUSE_INIT_TABLES=true
ALPHATRION_CLICKHOUSE_ENABLE_BATCH=true
2 changes: 1 addition & 1 deletion alphatrion/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
CLICKHOUSE_DATABASE = "ALPHATRION_CLICKHOUSE_DATABASE"
CLICKHOUSE_USERNAME = "ALPHATRION_CLICKHOUSE_USERNAME"
CLICKHOUSE_PASSWORD = "ALPHATRION_CLICKHOUSE_PASSWORD"
INIT_CLICKHOUSE_TABLES = "ALPHATRION_INIT_CLICKHOUSE_TABLES"
CLICKHOUSE_ENABLE_BATCH = "ALPHATRION_CLICKHOUSE_ENABLE_BATCH"

# Dashboard only related envs
DASHBOARD_USER_ID = "ALPHATRION_DASHBOARD_USER_ID"
Expand Down
78 changes: 73 additions & 5 deletions alphatrion/server/graphql/resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,23 @@ def get_run(id: strawberry.ID) -> Run | None:
metadb = runtime.storage_runtime().metadb
run = metadb.get_run(run_id=uuid.UUID(id))
if run:
meta = run.meta or {}

# Aggregate and cache tokens for completed runs.
# It could be slow for the first time.
if Status(run.status) == Status.COMPLETED and "total_tokens" not in meta:
token_data = GraphQLResolvers.aggregate_run_tokens(run_id=id)
if token_data["total_tokens"] > 0:
meta.update(token_data)
metadb.update_run(run_id=uuid.UUID(id), meta=meta)

return Run(
id=run.uuid,
team_id=run.team_id,
user_id=run.user_id,
project_id=run.project_id,
experiment_id=run.experiment_id,
meta=run.meta,
meta=meta,
status=GraphQLStatusEnum[Status(run.status).name],
created_at=run.created_at,
)
Expand All @@ -250,6 +260,24 @@ def list_exp_metrics(experiment_id: strawberry.ID) -> list[Metric]:
for m in metrics
]

@staticmethod
def list_run_metrics(run_id: strawberry.ID) -> list[Metric]:
    """Return the metrics recorded for a run as GraphQL ``Metric`` objects.

    Args:
        run_id: Identifier of the run. Either a string ID or a ``uuid.UUID``
            instance is accepted.

    Returns:
        One ``Metric`` per row returned by ``metadb.list_metrics_by_run_id``,
        in the order the store returns them.

    Raises:
        ValueError: If ``run_id`` is not a valid UUID string.
    """
    metadb = runtime.storage_runtime().metadb
    # The store method is annotated to take a uuid.UUID, and the other
    # resolvers convert the GraphQL ID before querying. Going through str()
    # makes the conversion work for both string IDs and UUID instances.
    run_uuid = uuid.UUID(str(run_id))
    metrics = metadb.list_metrics_by_run_id(run_id=run_uuid)
    return [
        Metric(
            id=m.uuid,
            key=m.key,
            value=m.value,
            team_id=m.team_id,
            project_id=m.project_id,
            experiment_id=m.experiment_id,
            run_id=m.run_id,
            created_at=m.created_at,
        )
        for m in metrics
    ]

@staticmethod
def total_projects(team_id: strawberry.ID) -> int:
metadb = runtime.storage_runtime().metadb
Expand Down Expand Up @@ -373,8 +401,48 @@ async def get_artifact_content(
raise RuntimeError(f"Failed to get artifact content: {e}") from e

@staticmethod
def list_traces(run_id: strawberry.ID) -> list[Span]:
"""List all traces/spans for a specific run."""
def aggregate_run_tokens(run_id: strawberry.ID) -> dict[str, int]:
    """Aggregate token usage from all spans recorded for a run.

    Args:
        run_id: Identifier of the run, as a UUID string.

    Returns:
        A dict with the keys ``total_tokens``, ``input_tokens`` and
        ``output_tokens``. All three are 0 when tracing is disabled or when
        anything fails while fetching or summing the spans (the failure is
        logged, never raised).
    """
    import logging

    from alphatrion import envs

    # Maps each result key to the span attribute it is summed from.
    attribute_by_key = {
        "total_tokens": "llm.usage.total_tokens",
        "input_tokens": "gen_ai.usage.input_tokens",
        "output_tokens": "gen_ai.usage.output_tokens",
    }

    # Check if tracing is enabled
    if os.getenv(envs.ENABLE_TRACING, "false").lower() != "true":
        return dict.fromkeys(attribute_by_key, 0)

    try:
        trace_store = runtime.storage_runtime().tracestore
        try:
            spans = trace_store.get_spans_by_run_id(uuid.UUID(run_id))
        finally:
            # Close the store even when the query raises, so a failed lookup
            # does not skip the close() the success path performs.
            trace_store.close()

        totals = dict.fromkeys(attribute_by_key, 0)
        for span in spans:
            # A span may carry a missing or null attribute map.
            span_attrs = span.get("SpanAttributes") or {}
            for key, attribute in attribute_by_key.items():
                if attribute in span_attrs:
                    totals[key] += int(span_attrs[attribute])
        return totals
    except Exception as e:
        logging.error("Failed to aggregate tokens for run %s: %s", run_id, e)
        # Return all zeros rather than a partial sum.
        return dict.fromkeys(attribute_by_key, 0)

@staticmethod
def list_spans(run_id: strawberry.ID) -> list[Span]:
"""List all spans for a specific run."""
from alphatrion import envs

# Check if tracing is enabled
Expand All @@ -385,12 +453,12 @@ def list_traces(run_id: strawberry.ID) -> list[Span]:
trace_store = runtime.storage_runtime().tracestore

# Get traces from ClickHouse
traces = trace_store.get_traces_by_run_id(uuid.UUID(run_id))
raw_spans = trace_store.get_spans_by_run_id(uuid.UUID(run_id))
trace_store.close()

# Convert to GraphQL Span objects
spans = []
for t in traces:
for t in raw_spans:
# Convert events
events = []
if t.get("Events"):
Expand Down
14 changes: 14 additions & 0 deletions alphatrion/server/graphql/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,20 @@ class Run:
status: GraphQLStatusEnum
created_at: datetime

@strawberry.field
def metrics(self) -> list["Metric"]:
    """Resolve the metrics recorded for this run.

    Delegates to ``GraphQLResolvers.list_run_metrics`` using this run's ``id``.
    """
    # Imported at call time rather than at module level; presumably this
    # avoids a circular import between the types and resolvers modules
    # (TODO confirm).
    from alphatrion.server.graphql.resolvers import GraphQLResolvers

    run_metrics = GraphQLResolvers.list_run_metrics(run_id=self.id)
    return run_metrics

@strawberry.field
def spans(self) -> list["Span"]:
    """Resolve the trace spans recorded for this run.

    Delegates to ``GraphQLResolvers.list_spans`` with this run's ``id``
    converted to a string.
    """
    # Imported at call time rather than at module level; presumably this
    # avoids a circular import between the types and resolvers modules
    # (TODO confirm).
    from alphatrion.server.graphql.resolvers import GraphQLResolvers

    run_key = str(self.id)
    return GraphQLResolvers.list_spans(run_id=run_key)


@strawberry.type
class Metric:
Expand Down
12 changes: 11 additions & 1 deletion alphatrion/storage/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from traceloop.sdk import Traceloop

from alphatrion import envs
Expand Down Expand Up @@ -38,10 +39,13 @@ def __init__(self):
== "true",
)

enable_batch = (
os.getenv(envs.CLICKHOUSE_ENABLE_BATCH, "true").lower() == "true"
)
Traceloop.init(
app_name="alphatrion",
exporter=ClickHouseSpanExporter(self.tracestore),
disable_batch=False, # Enable batching
disable_batch=not enable_batch,
telemetry_enabled=False,
)

Expand All @@ -60,6 +64,12 @@ def metadb(self):
def tracestore(self):
return self._tracestore

def flush(self, timeout_millis: int = 5000):
    """Force the global tracer provider to flush its pending spans.

    Does nothing when no trace store is configured, or when the global
    tracer provider is not an SDK ``TracerProvider`` instance.

    Args:
        timeout_millis: Timeout passed to ``force_flush``, in milliseconds.
            Defaults to 5000, the value that was previously hard-coded.
    """
    if not self._tracestore:
        return

    tracer_provider = trace.get_tracer_provider()
    # Only flush when the provider is the SDK class imported by this module;
    # other provider types are skipped.
    if isinstance(tracer_provider, TracerProvider):
        tracer_provider.force_flush(timeout_millis=timeout_millis)


def init():
"""
Expand Down
11 changes: 11 additions & 0 deletions alphatrion/storage/sqlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,3 +695,14 @@ def list_metrics_by_experiment_id(self, experiment_id: uuid.UUID) -> list[Metric
)
session.close()
return metrics

def list_metrics_by_run_id(self, run_id: uuid.UUID) -> list[Metric]:
    """Return all metrics belonging to a run, oldest first.

    Args:
        run_id: UUID of the run whose metrics are requested.

    Returns:
        The ``Metric`` rows whose ``run_id`` matches, ordered by
        ``created_at`` ascending.
    """
    session = self._session()
    try:
        return (
            session.query(Metric)
            .filter(Metric.run_id == run_id)
            .order_by(Metric.created_at.asc())
            .all()
        )
    finally:
        # Close the session even if the query raises, so it is not leaked.
        session.close()
Loading
Loading