-
Notifications
You must be signed in to change notification settings - Fork 244
feat(evals): add page #2182
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
benjamincanac
wants to merge
8
commits into
main
Choose a base branch
from
feat/evals
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
feat(evals): add page #2182
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
25c08da
feat(evals): add page
benjamincanac 92fcd3d
fix lint
benjamincanac 8a357d9
chore: update agent results
benjamincanac c7175ef
feat(app): display avatar
benjamincanac 50b4f43
improve expanded button hover state
benjamincanac e43b50f
feat: add link in footer
benjamincanac 29192e6
fix(evals): various improvements
benjamincanac e261731
feat: update agent results
benjamincanac File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,324 @@ | ||
| <script setup lang="ts"> | ||
| import { h, resolveComponent } from 'vue' | ||
| import type { TableColumn, TableRow } from '@nuxt/ui' | ||
| import { joinURL } from 'ufo' | ||
|
|
||
| const UButton = resolveComponent('UButton') | ||
| const UBadge = resolveComponent('UBadge') | ||
| const UAvatar = resolveComponent('UAvatar') | ||
|
|
||
| definePageMeta({ | ||
| heroBackground: 'opacity-70 -z-10' | ||
| }) | ||
|
|
||
| // Types | ||
| interface EvalResultItem { | ||
| evalPath: string | ||
| result: { | ||
| success: boolean | ||
| duration: number | ||
| evalPath: string | ||
| timestamp: string | ||
| } | ||
| } | ||
|
|
||
| interface Experiment { | ||
| name: string | ||
| timestamp: string | ||
| modelName: string | ||
| agentHarness: string | ||
| } | ||
|
|
||
| interface AgentResultsData { | ||
| metadata: { | ||
| exportedAt: string | ||
| experiments: Experiment[] | ||
| } | ||
| results: Record<string, EvalResultItem[]> | ||
| } | ||
|
|
||
| interface ModelRow { | ||
| model: string | ||
| agent: string | ||
| totalEvals: number | ||
| successRate: number | ||
| evals: EvalResultItem[] | ||
| } | ||
|
|
||
| const { url } = useSiteConfig() | ||
|
|
||
| const [{ data: page }, { data: rawData }] = await Promise.all([ | ||
| useAsyncData('evals', () => queryCollection('evals').first()), | ||
| useAsyncData('agent-results', () => $fetch<AgentResultsData>(joinURL(url, '/agent-results.json'))) | ||
| ]) | ||
|
|
||
| if (!page.value) { | ||
| throw createError({ statusCode: 404, statusMessage: 'Page not found', fatal: true }) | ||
| } | ||
| if (!rawData.value) { | ||
| throw createError({ statusCode: 404, statusMessage: 'Data not found', fatal: true }) | ||
| } | ||
|
|
||
| const title = page.value.title | ||
| const description = page.value.description | ||
|
|
||
| useSeoMeta({ | ||
| titleTemplate: '%s', | ||
| title, | ||
| description, | ||
| ogDescription: description, | ||
| ogTitle: title | ||
| }) | ||
| defineOgImageComponent('Docs', { title, description }) | ||
|
|
||
| // Index every experiment from the export metadata by its name. | ||
| // Yields an empty record when the metadata or its experiment list is absent. | ||
| const experimentMap = computed(() => { | ||
| const byName: Record<string, Experiment> = {} | ||
| for (const experiment of rawData.value?.metadata?.experiments || []) { | ||
| byName[experiment.name] = experiment | ||
| } | ||
| return byName | ||
| }) | ||
|
|
||
| // Turn the raw per-experiment results into one table row per experiment, | ||
| // ordered from highest to lowest success rate. | ||
| const allResults = computed<ModelRow[]>(() => { | ||
| const resultsByExperiment = rawData.value?.results | ||
| if (!resultsByExperiment) return [] | ||
| return Object.entries(resultsByExperiment) | ||
| .map(([experimentName, evals]): ModelRow => { | ||
| // Experiments missing from the metadata fall back to their raw name and an 'Unknown' agent | ||
| const experiment = experimentMap.value[experimentName] | ||
| const passed = evals.reduce((count, item) => count + (item.result.success ? 1 : 0), 0) | ||
| // Whole-number percentage; an experiment with no evals is reported as 0 rather than NaN | ||
| const successRate = evals.length > 0 ? Math.round((passed / evals.length) * 100) : 0 | ||
| return { | ||
| model: experiment?.modelName || experimentName, | ||
| agent: experiment?.agentHarness || 'Unknown', | ||
| totalEvals: evals.length, | ||
| successRate, | ||
| evals | ||
| } | ||
| }) | ||
| .sort((left, right) => right.successRate - left.successRate) | ||
| }) | ||
|
|
||
| // Agent filter: distinct agent names in first-seen order, plus the current selection | ||
| const agents = computed(() => Array.from(new Set(allResults.value.map(row => row.agent)))) | ||
| const selectedAgents = ref<string[]>([]) | ||
|
|
||
| // Rows shown in the table: everything when no agent is selected, otherwise only the selected agents | ||
| const filteredResults = computed(() => { | ||
| return selectedAgents.value.length === 0 | ||
| ? allResults.value | ||
| : allResults.value.filter(row => selectedAgents.value.includes(row.agent)) | ||
| }) | ||
|
|
||
| // Format the export timestamp as a long en-US date (e.g. "January 5, 2025"). | ||
| // The time zone is pinned to UTC so the server-rendered and client-rendered strings are identical | ||
| // whatever zone the server or the viewer is in (avoids a hydration mismatch around midnight). | ||
| // A missing or unparseable timestamp yields an empty string instead of "Invalid Date". | ||
| const formattedDate = computed(() => { | ||
| const exportedAt = rawData.value?.metadata?.exportedAt | ||
| if (!exportedAt) return '' | ||
| const date = new Date(exportedAt) | ||
| if (Number.isNaN(date.getTime())) return '' | ||
| return date.toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric', timeZone: 'UTC' }) | ||
| }) | ||
|
|
||
| // Model avatar mapping | ||
| const modelAvatarMap: Record<string, string> = { | ||
| claude: '/assets/agents/anthropic.avif', | ||
| gpt: '/assets/agents/openai.avif', | ||
| codex: '/assets/agents/openai.avif', | ||
| gemini: '/assets/agents/google.avif', | ||
| deepseek: '/assets/agents/deepseek.avif', | ||
| devstral: '/assets/agents/mistral.avif', | ||
| minimax: '/assets/agents/minimax.avif', | ||
| kat: '/assets/agents/kwaipilot.avif', | ||
| moonshot: '/assets/agents/moonshotai.avif', | ||
| grok: '/assets/agents/xai.avif' | ||
| } | ||
|
|
||
| // Resolve the avatar path for a model name: the first key of modelAvatarMap (in declaration order) | ||
| // that occurs in the lowercased name wins; returns undefined when no key matches. | ||
| function getModelAvatar(model: string): string | undefined { | ||
| const normalized = model.toLowerCase() | ||
| for (const [vendorKey, avatarPath] of Object.entries(modelAvatarMap)) { | ||
| if (normalized.includes(vendorKey)) return avatarPath | ||
| } | ||
| return undefined | ||
| } | ||
|
|
||
| // Format duration from ms to seconds | ||
| function formatDuration(ms: number): string { | ||
| return `${(ms / 1000).toFixed(2)}s` | ||
| } | ||
|
|
||
| // Expanded rows state | ||
| const expanded = ref({}) | ||
|
|
||
| // Toggle expand on row click | ||
| function onSelect(_e: Event, row: TableRow<ModelRow>) { | ||
| row.toggleExpanded() | ||
| } | ||
|
|
||
| // Table columns for the per-model summary table: | ||
| // expand toggle, model (with avatar), agent, total eval count and success rate. | ||
| const columns: TableColumn<ModelRow>[] = [ | ||
| { | ||
| id: 'expand', | ||
| meta: { | ||
| class: { | ||
| th: 'w-0', | ||
| td: 'w-0' | ||
| } | ||
| }, | ||
| cell: ({ row }) => { | ||
| const isExpanded = row.getIsExpanded() | ||
| return h(UButton, { | ||
| 'color': 'neutral', | ||
| 'variant': 'ghost', | ||
| 'icon': 'i-lucide-chevron-right', | ||
| 'square': true, | ||
| 'size': 'sm', | ||
| // Expose the toggle state to assistive technology instead of a static "Expand" label | ||
| 'aria-label': isExpanded ? 'Collapse' : 'Expand', | ||
| 'aria-expanded': isExpanded, | ||
| 'ui': { | ||
| leadingIcon: ['transition-transform', isExpanded ? 'duration-200 rotate-90' : ''] | ||
| }, | ||
| 'onClick': (e: Event) => { | ||
| // The row itself toggles on select; stop the click from bubbling so it does not toggle twice | ||
| e.stopPropagation() | ||
| row.toggleExpanded() | ||
| }, | ||
| 'class': 'group-hover:bg-elevated' | ||
| }) | ||
| } | ||
| }, | ||
| { | ||
| accessorKey: 'model', | ||
| header: 'Model', | ||
| cell: ({ row }) => h('div', { class: 'flex items-center gap-2' }, [ | ||
| h(UAvatar, { src: getModelAvatar(row.original.model), size: 'xs', loading: 'lazy', class: 'border border-default' }), | ||
| h('span', {}, row.original.model) | ||
| ]) | ||
| }, | ||
| { | ||
| accessorKey: 'agent', | ||
| header: 'Agent' | ||
| }, | ||
| { | ||
| accessorKey: 'totalEvals', | ||
| header: 'Total Evals', | ||
| meta: { | ||
| class: { | ||
| th: 'text-center', | ||
| td: 'text-center' | ||
| } | ||
| } | ||
| }, | ||
| { | ||
| accessorKey: 'successRate', | ||
| header: 'Success Rate', | ||
| meta: { | ||
| class: { | ||
| th: 'text-right', | ||
| td: 'text-right' | ||
| } | ||
| }, | ||
| cell: ({ row }) => h('span', {}, `${row.original.successRate}%`) | ||
| } | ||
| ] | ||
|
|
||
| // Expanded eval table columns | ||
| const evalColumns: TableColumn<EvalResultItem>[] = [ | ||
| { | ||
| accessorKey: 'evalPath', | ||
| header: 'Evaluation' | ||
| }, | ||
| { | ||
| id: 'score', | ||
| header: 'Score', | ||
| meta: { | ||
| class: { | ||
| th: 'text-center', | ||
| td: 'text-center' | ||
| } | ||
| }, | ||
| cell: ({ row }) => h(UBadge, { | ||
| color: row.original.result.success ? 'success' : 'error', | ||
| variant: 'subtle' | ||
| }, () => row.original.result.success ? 'Pass' : 'Fail') | ||
| }, | ||
| { | ||
| id: 'duration', | ||
| header: 'Duration', | ||
| meta: { | ||
| class: { | ||
| th: 'text-right', | ||
| td: 'text-right' | ||
| } | ||
| }, | ||
| cell: ({ row }) => h('span', {}, formatDuration(row.original.result.duration)) | ||
| } | ||
| ] | ||
| </script> | ||
|
|
||
| <template> | ||
| <div v-if="page && rawData"> | ||
| <UPageHero | ||
| :title="page.title" | ||
| :description="page.description" | ||
| :ui="{ | ||
| title: 'text-4xl sm:text-5xl lg:text-6xl font-bold', | ||
| description: 'max-w-2xl mx-auto text-pretty', | ||
| links: 'items-center' | ||
| }" | ||
| > | ||
| <template #links> | ||
| <UButton | ||
| :to="page.githubUrl" | ||
| icon="i-simple-icons-github" | ||
| label="View on GitHub" | ||
| target="_blank" | ||
| color="neutral" | ||
| variant="ghost" | ||
| /> | ||
|
|
||
| <USeparator orientation="vertical" class="h-6" /> | ||
|
|
||
| <span class="text-sm font-medium">Last run date: <span class="text-muted font-normal">{{ formattedDate }}</span></span> | ||
| </template> | ||
| </UPageHero> | ||
|
|
||
| <UPageBody class="mt-0"> | ||
| <UContainer class="max-w-6xl"> | ||
| <div class="flex items-center justify-between mb-4"> | ||
| <h2 class="text-2xl font-bold"> | ||
| Agent Performance Results | ||
| </h2> | ||
|
|
||
| <USelectMenu | ||
| v-model="selectedAgents" | ||
| :items="agents" | ||
| multiple | ||
| placeholder="All Agents" | ||
| color="neutral" | ||
| variant="subtle" | ||
| class="w-52 bg-elevated/50 hover:bg-elevated data-[state=open]:bg-elevated group" | ||
| :ui="{ trailingIcon: 'group-data-[state=open]:rotate-180 transition-transform duration-200' }" | ||
| /> | ||
| </div> | ||
|
|
||
| <UTable | ||
| v-model:expanded="expanded" | ||
| :data="filteredResults" | ||
| :columns="columns" | ||
| :ui="{ | ||
| thead: '[&>tr]:bg-elevated/50 border-b border-default', | ||
| tr: 'py-2.5 peer peer-data-[expanded=true]:[&>td]:p-4! group', | ||
| td: 'py-2.5' | ||
| }" | ||
| class="flex-1 border border-default rounded-lg" | ||
| @select="onSelect" | ||
| > | ||
| <template #expanded="{ row }"> | ||
| <UTable | ||
| :data="row.original.evals" | ||
| :columns="evalColumns" | ||
| :ui="{ | ||
| thead: '[&>tr]:bg-elevated/50 border-b border-default', | ||
| tr: 'py-2.5', | ||
| td: 'py-2.5' | ||
| }" | ||
| class="flex-1 border border-default rounded-lg" | ||
| /> | ||
| </template> | ||
| </UTable> | ||
| </UContainer> | ||
| </UPageBody> | ||
| </div> | ||
| </template> | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| title: AI Agent Evaluations | ||
| description: "Performance results of AI coding agents on Nuxt code generation tasks, measuring success rate and execution time." | ||
| githubUrl: "https://github.com/vercel/nuxt-evals" |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.