diff --git a/docker-compose.override.yml b/docker-compose.override.yml
new file mode 100644
index 00000000..208e9b3b
--- /dev/null
+++ b/docker-compose.override.yml
@@ -0,0 +1,20 @@
+# Worktree-specific overrides (auto-generated by create-worktree.sh)
+# Container names and ports must be unique per worktree
+# Worktree: evoskill-benchmark-harness
+# Generated: 2026-03-12T06:25:29Z
+
+services:
+  dev:
+    container_name: evoskill-benchmark-harness-dev-1
+    environment:
+      - DEV_PORT=3811
+
+  test:
+    container_name: evoskill-benchmark-harness-test-1
+    ports:
+      - "3812:3000"
+
+  orchestrator:
+    container_name: evoskill-benchmark-harness-orchestrator-1
+    ports:
+      - "3813:3000"
diff --git a/packages/cli/src/commands/benchmark/evoskill.ts b/packages/cli/src/commands/benchmark/evoskill.ts
new file mode 100644
index 00000000..2d28ff8e
--- /dev/null
+++ b/packages/cli/src/commands/benchmark/evoskill.ts
@@ -0,0 +1,226 @@
+// EvoSkill benchmark CLI entry point — SMI-3275
+// Docker-first: docker exec skillsmith-dev-1 npm run benchmark:evoskill -- --benchmark officeqa
+
+import { Command } from 'commander'
+import chalk from 'chalk'
+import * as fs from 'fs'
+import * as path from 'path'
+import {
+  runHarness,
+  createBaselineSelector,
+  createCuratedSelector,
+  createSearchSelector,
+  createRecommendSelector,
+  createIterativeSelector,
+  getScorerForBenchmark,
+  generateMarkdownReport,
+  generateJsonReport,
+  CONDITIONS,
+  EVOSKILL_DEFAULTS,
+  type ConditionConfig,
+  type HarnessConfig,
+  type AgentClient,
+  type LlmJudgeClient,
+  type HarnessProgressEvent,
+  type SkillSelectorFn,
+} from '@skillsmith/core'
+
+type BenchmarkName = 'officeqa' | 'sealqa' | 'browsecomp'
+
+interface EvoskillOptions {
+  benchmark: string
+  condition: string
+  seeds: string
+  sample: string
+  output: string
+  datasetDir: string
+  dryRun: boolean
+  model: string
+}
+
+export function createEvoskillBenchmarkCommand(): Command {
+  return new Command('evoskill')
+    .description('Run EvoSkill benchmark evaluation harness')
+    .option('-b, --benchmark <name>', 'Benchmark: officeqa, sealqa, browsecomp, all', 'all')
+    .option('-c, --condition <ids>', 'Condition IDs: 1-9, all (comma-separated)', 'all')
+    .option('-s, --seeds <n>', 'Number of seeds', '3')
+    .option('--sample <fraction>', 'Sample fraction of test set (0-1)', '1.0')
+    .option('-o, --output <dir>', 'Output directory', '/app/results/evoskill/')
+    .option('-d, --dataset-dir <dir>', 'Base directory for dataset files', '/app/data/')
+    .option('--dry-run', 'Validate config without API calls', false)
+    .option('-m, --model <id>', 'Agent model ID', EVOSKILL_DEFAULTS.AGENT_MODEL_ID)
+    .action(async (opts: EvoskillOptions) => {
+      try {
+        await runEvoskillBenchmark(opts)
+      } catch (error) {
+        console.error(chalk.red('Error:'), error instanceof Error ? error.message : error)
+        process.exit(1)
+      }
+    })
+}
+
+async function runEvoskillBenchmark(opts: EvoskillOptions): Promise<void> {
+  const benchmarks = parseBenchmarks(opts.benchmark)
+  const conditionIds = parseConditions(opts.condition)
+  const seeds = parseSeeds(opts.seeds)
+  const sampleFraction = parseFloat(opts.sample)
+  const modelId = opts.model
+
+  console.log(chalk.bold('EvoSkill Benchmark Harness'))
+  console.log(`  Benchmarks: ${benchmarks.join(', ')}`)
+  console.log(`  Conditions: ${conditionIds.join(', ')}`)
+  console.log(`  Seeds: ${seeds.join(', ')}`)
+  console.log(`  Sample: ${(sampleFraction * 100).toFixed(0)}%`)
+  console.log(`  Model: ${modelId}`)
+  console.log(`  Dry run: ${opts.dryRun}`)
+  console.log()
+
+  // Build condition configs
+  const conditions = buildConditions(conditionIds, modelId, seeds)
+
+  // Build harness config
+  const config: HarnessConfig = {
+    benchmarks,
+    conditions,
+    seeds,
+    sampleFraction,
+    datasetDir: opts.datasetDir,
+    outputDir: opts.output,
+    dryRun: opts.dryRun,
+  }
+
+  // Create dependencies (agent client placeholder — real implementation uses Anthropic SDK)
+  const agentClient = createAgentClient()
+  const judgeClient = createJudgeClient()
+
+  const result = await runHarness(config, {
+    agentClient,
+    getScorer: (benchmark) => getScorerForBenchmark(
+      benchmark === 'officeqa' ? 'officeqa' : benchmark === 'browsecomp' ? 'browsecomp' : 'sealqa',
+      EVOSKILL_DEFAULTS.JUDGE_MODEL_ID,
+      benchmark !== 'officeqa' ? judgeClient : undefined
+    ),
+    readFile: async (filePath: string) => fs.readFileSync(filePath, 'utf-8'),
+  }, (event: HarnessProgressEvent) => {
+    switch (event.type) {
+      case 'seed_start':
+        console.log(chalk.cyan(`[seed=${event.seed}] Starting ${event.benchmark}...`))
+        break
+      case 'condition_complete':
+        if (event.result) {
+          const acc = (event.result.accuracy * 100).toFixed(1)
+          console.log(
+            chalk.green(`  [${event.condition}] accuracy=${acc}% cost=$${event.result.costDollars.toFixed(2)}`)
+          )
+        }
+        break
+      case 'harness_complete':
+        console.log(chalk.bold('\nHarness complete.'))
+        break
+    }
+  })
+
+  // Write outputs
+  fs.mkdirSync(opts.output, { recursive: true })
+
+  const mdReport = generateMarkdownReport(result)
+  const mdPath = path.join(opts.output, 'report.md')
+  fs.writeFileSync(mdPath, mdReport)
+  console.log(`Markdown report: ${mdPath}`)
+
+  const jsonReport = generateJsonReport(result)
+  const jsonPath = path.join(opts.output, 'results.json')
+  fs.writeFileSync(jsonPath, jsonReport)
+  console.log(`JSON results: ${jsonPath}`)
+
+  console.log(`\nTotal wall clock: ${(result.wallClockMs / 1000).toFixed(1)}s`)
+}
+
+function parseBenchmarks(input: string): BenchmarkName[] {
+  if (input === 'all') return ['officeqa', 'sealqa', 'browsecomp']
+  const names = input.split(',').map((s) => s.trim()) as BenchmarkName[]
+  for (const name of names) {
+    if (!['officeqa', 'sealqa', 'browsecomp'].includes(name)) {
+      throw new Error(`Unknown benchmark: ${name}`)
+    }
+  }
+  return names
+}
+
+function parseConditions(input: string): number[] {
+  if (input === 'all') return [1, 2, 3, 4, 5, 6, 7, 8, 9]
+  return input.split(',').map((s) => {
+    const n = parseInt(s.trim(), 10)
+    if (isNaN(n) || n < 1 || n > 9) throw new Error(`Invalid condition: ${s}`)
+    return n
+  })
+}
+
+function parseSeeds(input: string): number[] {
+  const n = parseInt(input, 10)
+  if (isNaN(n) || n < 1) throw new Error(`Invalid seeds: ${input}`)
+  return Array.from({ length: n }, (_, i) => EVOSKILL_DEFAULTS.SEED + i)
+}
+
+function buildConditions(ids: number[], modelId: string, seeds: number[]): ConditionConfig[] {
+  const configs: ConditionConfig[] = []
+
+  for (const id of ids) {
+    const name = CONDITIONS[id as keyof typeof CONDITIONS]
+
+    let selectorFn: SkillSelectorFn
+    switch (id) {
+      case 1: selectorFn = createBaselineSelector(); break
+      case 3: selectorFn = createSearchSelector({ search: async () => [] }); break
+      case 4: selectorFn = createRecommendSelector({ recommend: async () => [] }); break
+      case 7: selectorFn = createIterativeSelector(); break
+      case 9: selectorFn = createCuratedSelector([]); break
+      case 2:
+      case 5:
+      case 6:
+      case 8:
+        throw new Error(
+          `Condition ${id} (${name}) requires runtime dependencies not yet configured. ` +
+          `Condition 2 needs --evolved-skill path, 5 needs TransformationService, ` +
+          `6 needs SkillCreateRunner, 8 needs search client + evolve function.`
+        )
+      default:
+        throw new Error(`Unknown condition ID: ${id}`)
+    }
+
+    for (const seed of seeds) {
+      configs.push({
+        name: `${name} (seed=${seed})`,
+        skillSelector: selectorFn,
+        modelId,
+        seed,
+      })
+    }
+  }
+
+  return configs
+}
+
+/** Placeholder agent client — replace with real Anthropic SDK calls */
+function createAgentClient(): AgentClient {
+  return {
+    async runTask() {
+      throw new Error(
+        'AgentClient not configured. Set ANTHROPIC_API_KEY and provide a real implementation.'
+      )
+    },
+  }
+}
+
+/** Placeholder judge client — replace with real Anthropic SDK calls */
+function createJudgeClient(): LlmJudgeClient {
+  return {
+    async judge() {
+      throw new Error(
+        'LlmJudgeClient not configured. Set ANTHROPIC_API_KEY and provide a real implementation.'
+      )
+    },
+  }
+}
+
+export default createEvoskillBenchmarkCommand
diff --git a/packages/cli/src/commands/index.ts b/packages/cli/src/commands/index.ts
index f7143c06..ac510c1b 100644
--- a/packages/cli/src/commands/index.ts
+++ b/packages/cli/src/commands/index.ts
@@ -52,3 +52,6 @@ export { createAuditCommand } from './audit.js'
 
 // SMI-3083: Embedded skill scaffolding
 export { createCreateCommand, createSkill, validateSkillName } from './create.js'
+
+// SMI-3275: EvoSkill Benchmark CLI
+export { createEvoskillBenchmarkCommand } from './benchmark/evoskill.js'
diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts
index a34bfafd..b1dbb70f 100644
--- a/packages/cli/src/index.ts
+++ b/packages/cli/src/index.ts
@@ -43,6 +43,7 @@ import {
   createUnpinCommand,
   createAuditCommand,
   createCreateCommand,
+  createEvoskillBenchmarkCommand,
 } from './commands/index.js'
 import { DEFAULT_DB_PATH } from './config.js'
 import { sanitizeError } from './utils/sanitize.js'
@@ -161,4 +162,9 @@ program.addCommand(createAuditCommand())
 // SMI-3083: Embedded skill scaffolding (also available as `sklx create`)
 program.addCommand(createCreateCommand())
 
+// SMI-3275: EvoSkill Benchmark Harness
+const benchmarkGroup = new Command('benchmark').description('Performance benchmark commands')
+benchmarkGroup.addCommand(createEvoskillBenchmarkCommand())
+program.addCommand(benchmarkGroup)
+
 program.parse()
diff --git a/packages/core/src/benchmarks/evoskill/agent-runner.ts b/packages/core/src/benchmarks/evoskill/agent-runner.ts
new file mode 100644
index 00000000..12b3ed37
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/agent-runner.ts
@@ -0,0 +1,176 @@
+// EvoSkill agent runner — SMI-3271
+// Executes benchmark tasks via Claude API with exponential backoff
+
+import type { BenchmarkTask } from './types.js'
+import { EVOSKILL_DEFAULTS } from './types.js'
+
+/** Token usage for a single task execution */
+export interface TaskTokenUsage {
+  inputTokens: number
+  outputTokens: number
+}
+
+/** Result of running a single task */
+export interface TaskResult {
+  taskId: string
+  predicted: string
+  tokens: TaskTokenUsage
+  durationMs: number
+  error?: string
+}
+
+/** Client interface for Claude API calls — injected to avoid SDK dependency in core */
+export interface AgentClient {
+  runTask(params: {
+    model: string
+    systemPrompt: string
+    userMessage: string
+    maxTokens: number
+    temperature: number
+    timeoutMs: number
+  }): Promise<{
+    content: string
+    inputTokens: number
+    outputTokens: number
+  }>
+}
+
+export interface AgentRunnerConfig {
+  client: AgentClient
+  modelId: string
+  skills: string[]
+  timeoutMs?: number
+}
+
+/**
+ * Run a single benchmark task through the agent.
+ * Skills are injected as system prompt prefix.
+ */
+export async function runEvoSkillTask(
+  task: BenchmarkTask,
+  config: AgentRunnerConfig
+): Promise<TaskResult> {
+  const { client, modelId, skills, timeoutMs = EVOSKILL_DEFAULTS.TASK_TIMEOUT_MS } = config
+  const start = Date.now()
+
+  const systemPrompt = buildSystemPrompt(skills)
+
+  try {
+    const response = await callWithRetry(
+      () =>
+        client.runTask({
+          model: modelId,
+          systemPrompt,
+          userMessage: task.question,
+          maxTokens: 1024,
+          temperature: 0,
+          timeoutMs,
+        }),
+      EVOSKILL_DEFAULTS.RETRY_DELAYS
+    )
+
+    return {
+      taskId: task.id,
+      predicted: response.content.trim(),
+      tokens: {
+        inputTokens: response.inputTokens,
+        outputTokens: response.outputTokens,
+      },
+      durationMs: Date.now() - start,
+    }
+  } catch (err) {
+    return {
+      taskId: task.id,
+      predicted: '',
+      tokens: { inputTokens: 0, outputTokens: 0 },
+      durationMs: Date.now() - start,
+      error: err instanceof Error ? err.message : String(err),
+    }
+  }
+}
+
+/** Run all tasks in a batch, sequentially to respect rate limits */
+export async function runEvoSkillBatch(
+  tasks: BenchmarkTask[],
+  config: AgentRunnerConfig,
+  onProgress?: (completed: number, total: number) => void
+): Promise<TaskResult[]> {
+  const results: TaskResult[] = []
+
+  for (let i = 0; i < tasks.length; i++) {
+    const result = await runEvoSkillTask(tasks[i], config)
+    results.push(result)
+    onProgress?.(i + 1, tasks.length)
+  }
+
+  return results
+}
+
+/** Build system prompt from skill contents */
+function buildSystemPrompt(skills: string[]): string {
+  if (skills.length === 0) {
+    return 'Answer the question concisely and accurately.'
+  }
+
+  const skillBlock = skills
+    .map((s, i) => `<skill index="${i + 1}">\n${s}\n</skill>`)
+    .join('\n\n')
+
+  return `You have the following skills available. Use them to answer the question concisely and accurately.\n\n${skillBlock}`
+}
+
+/** Call with exponential backoff on rate limit (429) errors */
+async function callWithRetry<T>(
+  fn: () => Promise<T>,
+  delays: readonly number[]
+): Promise<T> {
+  let lastError: Error | undefined
+
+  // First attempt (no delay)
+  try {
+    return await fn()
+  } catch (err) {
+    if (!isRateLimitError(err)) throw err
+    lastError = err instanceof Error ? err : new Error(String(err))
+  }
+
+  // Retry attempts with exponential backoff
+  for (const delay of delays) {
+    await sleep(delay)
+    try {
+      return await fn()
+    } catch (err) {
+      if (!isRateLimitError(err)) throw err
+      lastError = err instanceof Error ? err : new Error(String(err))
+    }
+  }
+
+  throw lastError ?? new Error('All retries exhausted')
+}
+
+function isRateLimitError(err: unknown): boolean {
+  if (err instanceof Error) {
+    return err.message.includes('429') || err.message.toLowerCase().includes('rate limit')
+  }
+  return false
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms))
+}
+
+/** Calculate cost in dollars from token counts */
+export function calculateCost(
+  tokens: TaskTokenUsage,
+  modelId: string
+): number {
+  const pricing = MODEL_PRICING[modelId] ?? MODEL_PRICING['default']
+  return (tokens.inputTokens * pricing.inputPerToken) + (tokens.outputTokens * pricing.outputPerToken)
+}
+
+/** Per-token pricing (dollars) — updated for current models */
+const MODEL_PRICING: Record<string, { inputPerToken: number; outputPerToken: number }> = {
+  'claude-sonnet-4-6': { inputPerToken: 3e-6, outputPerToken: 15e-6 },
+  'claude-opus-4-6': { inputPerToken: 15e-6, outputPerToken: 75e-6 },
+  'default': { inputPerToken: 3e-6, outputPerToken: 15e-6 },
+}
diff --git a/packages/core/src/benchmarks/evoskill/dataset-loader.ts b/packages/core/src/benchmarks/evoskill/dataset-loader.ts
new file mode 100644
index 00000000..79a788b9
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/dataset-loader.ts
@@ -0,0 +1,185 @@
+// EvoSkill dataset loader — SMI-3269
+// Parses EvoSkill CSV (DABStep, SEAL-QA) and BrowseComp JSON
+// Applies train/val/test splits with configurable seed
+
+import type { BenchmarkTask } from './types.js'
+import { EVOSKILL_DEFAULTS } from './types.js'
+
+/** Seeded PRNG (Mulberry32) for deterministic shuffles */
+function mulberry32(seed: number): () => number {
+  let s = seed | 0
+  return () => {
+    s = (s + 0x6d2b79f5) | 0
+    let t = Math.imul(s ^ (s >>> 15), 1 | s)
+    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
+  }
+}
+
+/** Fisher-Yates shuffle with seeded PRNG */
+function seededShuffle<T>(arr: T[], seed: number): T[] {
+  const result = [...arr]
+  const rng = mulberry32(seed)
+  for (let i = result.length - 1; i > 0; i--) {
+    const j = Math.floor(rng() * (i + 1))
+    ;[result[i], result[j]] = [result[j], result[i]]
+  }
+  return result
+}
+
+/** Parse a CSV line, handling quoted fields with commas */
+function parseCSVLine(line: string): string[] {
+  const fields: string[] = []
+  let current = ''
+  let inQuotes = false
+
+  for (let i = 0; i < line.length; i++) {
+    const ch = line[i]
+    if (ch === '"') {
+      if (inQuotes && i + 1 < line.length && line[i + 1] === '"') {
+        current += '"'
+        i++
+      } else {
+        inQuotes = !inQuotes
+      }
+    } else if (ch === ',' && !inQuotes) {
+      fields.push(current)
+      current = ''
+    } else {
+      current += ch
+    }
+  }
+  fields.push(current)
+  return fields
+}
+
+export interface DatasetLoadResult {
+  tasks: BenchmarkTask[]
+  train: BenchmarkTask[]
+  val: BenchmarkTask[]
+  test: BenchmarkTask[]
+}
+
+/**
+ * Load a CSV dataset (OfficeQA / SEAL-QA format).
+ * Expected columns: question, answer (ground truth).
+ * Column order detected from header row.
+ */
+export function loadCSVDataset(
+  csvContent: string,
+  benchmark: 'officeqa' | 'sealqa',
+  options: { seed?: number; trainRatio?: number; valRatio?: number } = {}
+): DatasetLoadResult {
+  const lines = csvContent.split('\n').filter((l) => l.trim().length > 0)
+  if (lines.length < 2) {
+    throw new Error(`Dataset ${benchmark} has fewer than 2 lines (no data rows)`)
+  }
+
+  const header = parseCSVLine(lines[0]).map((h) => h.trim().toLowerCase())
+  const qIdx = header.indexOf('question')
+  const aIdx = header.findIndex((h) => h === 'answer' || h === 'ground_truth' || h === 'groundtruth')
+
+  if (qIdx === -1 || aIdx === -1) {
+    throw new Error(
+      `Dataset ${benchmark} missing required columns. Found: ${header.join(', ')}. Need: question, answer/ground_truth`
+    )
+  }
+
+  const tasks: BenchmarkTask[] = []
+  for (let i = 1; i < lines.length; i++) {
+    const fields = parseCSVLine(lines[i])
+    if (fields.length <= Math.max(qIdx, aIdx)) continue
+
+    tasks.push({
+      id: `${benchmark}-${i}`,
+      question: fields[qIdx].trim(),
+      groundTruth: fields[aIdx].trim(),
+      split: 'test', // placeholder — assigned below
+      benchmark,
+    })
+  }
+
+  return splitDataset(tasks, benchmark, options)
+}
+
+/**
+ * Load BrowseComp JSON dataset.
+ * Expected format: array of { question: string, answer: string }
+ */
+export function loadJSONDataset(
+  jsonContent: string,
+  benchmark: 'browsecomp',
+  options: { seed?: number; trainRatio?: number; valRatio?: number } = {}
+): DatasetLoadResult {
+  const parsed: unknown = JSON.parse(jsonContent)
+  if (!Array.isArray(parsed) || parsed.length === 0) {
+    throw new Error(
+      `Dataset ${benchmark} is empty or not an array (got ${typeof parsed})`
+    )
+  }
+
+  const tasks: BenchmarkTask[] = []
+  for (let i = 0; i < parsed.length; i++) {
+    const item = parsed[i] as Record<string, unknown>
+    if (typeof item?.question !== 'string' || typeof item?.answer !== 'string') {
+      throw new Error(
+        `Dataset ${benchmark} item ${i} missing required string fields: question=${typeof item?.question}, answer=${typeof item?.answer}`
+      )
+    }
+    if (!item.question.trim() || !item.answer.trim()) {
+      throw new Error(`Dataset ${benchmark} item ${i} has empty question or answer`)
+    }
+    tasks.push({
+      id: `${benchmark}-${i + 1}`,
+      question: item.question,
+      groundTruth: item.answer,
+      split: 'test' as const,
+      benchmark,
+    })
+  }
+
+  return splitDataset(tasks, benchmark, options)
+}
+
+/** Apply train/val/test split with seeded shuffle */
+function splitDataset(
+  tasks: BenchmarkTask[],
+  benchmark: string,
+  options: { seed?: number; trainRatio?: number; valRatio?: number } = {}
+): DatasetLoadResult {
+  const seed = options.seed ?? EVOSKILL_DEFAULTS.SEED
+  const trainRatio = options.trainRatio ?? EVOSKILL_DEFAULTS.TRAIN_RATIO
+  const valRatio = options.valRatio ?? EVOSKILL_DEFAULTS.VAL_RATIO
+
+  if (trainRatio + valRatio >= 1) {
+    throw new Error(`train + val ratios must be < 1 (got ${trainRatio} + ${valRatio})`)
+  }
+
+  const shuffled = seededShuffle(tasks, seed)
+  const n = shuffled.length
+  const trainEnd = Math.round(n * trainRatio)
+  const valEnd = Math.round(n * (trainRatio + valRatio))
+
+  const train = shuffled.slice(0, trainEnd).map((t) => ({ ...t, split: 'train' as const }))
+  const val = shuffled.slice(trainEnd, valEnd).map((t) => ({ ...t, split: 'val' as const }))
+  const test = shuffled.slice(valEnd).map((t) => ({ ...t, split: 'test' as const }))
+
+  if (test.length === 0) {
+    throw new Error(`Dataset ${benchmark} has 0 test tasks after split (${n} total)`)
+  }
+
+  const allTasks = [...train, ...val, ...test]
+  return { tasks: allTasks, train, val, test }
+}
+
+/** Load dataset from file content, auto-detecting format */
+export function loadDataset(
+  content: string,
+  benchmark: 'officeqa' | 'sealqa' | 'browsecomp',
+  options: { seed?: number; trainRatio?: number; valRatio?: number } = {}
+): DatasetLoadResult {
+  if (benchmark === 'browsecomp') {
+    return loadJSONDataset(content, benchmark, options)
+  }
+  return loadCSVDataset(content, benchmark, options)
+}
diff --git a/packages/core/src/benchmarks/evoskill/evaluator.ts b/packages/core/src/benchmarks/evoskill/evaluator.ts
new file mode 100644
index 00000000..2c0a1ecb
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/evaluator.ts
@@ -0,0 +1,141 @@
+// EvoSkill evaluator — SMI-3272
+// Scores predictions, aggregates results, computes IR metrics
+
+import type { BenchmarkTask, EvoSkillBenchmarkResult, ScorerFn } from './types.js'
+import type { TaskResult } from './agent-runner.js'
+import { calculateCost } from './agent-runner.js'
+import { ndcg, mrr, mapAtK } from './ir-metrics.js'
+
+export interface EvaluatorConfig {
+  scorer: ScorerFn
+  condition: string
+  benchmark: string
+  split: string
+  modelId: string
+  /** Whether to compute IR metrics (for retrieval conditions 3-4) */
+  computeIrMetrics?: boolean
+  /** Ranked skill IDs for IR metrics (ordered by relevance) */
+  rankedSkillIds?: string[]
+  /** Relevant skill IDs (ground truth) for IR metrics */
+  relevantSkillIds?: Set<string>
+  /** Score threshold for counting as correct (default: 0.5) */
+  scoreThreshold?: number
+}
+
+/**
+ * Evaluate task results and produce an aggregate benchmark result.
+ */
+export async function evaluate(
+  tasks: BenchmarkTask[],
+  results: TaskResult[],
+  config: EvaluatorConfig
+): Promise<EvoSkillBenchmarkResult> {
+  const { scorer, condition, benchmark, split, modelId, computeIrMetrics, scoreThreshold = 0.5 } = config
+
+  // Build task map for lookup
+  const taskMap = new Map(tasks.map((t) => [t.id, t]))
+
+  let correctCount = 0
+  let totalInputTokens = 0
+  let totalOutputTokens = 0
+  let totalDurationMs = 0
+
+  for (const result of results) {
+    const task = taskMap.get(result.taskId)
+    if (!task) continue
+
+    if (!result.error && result.predicted) {
+      const score = await scorer(task.question, result.predicted, task.groundTruth)
+      if (score >= scoreThreshold) correctCount++
+    }
+
+    totalInputTokens += result.tokens.inputTokens
+    totalOutputTokens += result.tokens.outputTokens
+    totalDurationMs += result.durationMs
+  }
+
+  const taskCount = results.length
+  const accuracy = taskCount > 0 ? correctCount / taskCount : 0
+  const costDollars = calculateCost(
+    { inputTokens: totalInputTokens, outputTokens: totalOutputTokens },
+    modelId
+  )
+
+  const evalResult: EvoSkillBenchmarkResult = {
+    condition,
+    benchmark,
+    split,
+    accuracy,
+    taskCount,
+    correctCount,
+    costTokens: totalInputTokens + totalOutputTokens,
+    costDollars,
+    wallClockMs: totalDurationMs,
+  }
+
+  // IR metrics for retrieval conditions
+  if (computeIrMetrics && config.rankedSkillIds && config.relevantSkillIds) {
+    const ranked = config.rankedSkillIds
+    const relevant = config.relevantSkillIds
+    evalResult.irMetrics = {
+      ndcg5: ndcg(ranked, new Map([...relevant].map((id) => [id, 1])), 5),
+      mrr: mrr(ranked, relevant),
+      map5: mapAtK(ranked, relevant, 5),
+    }
+  }
+
+  return evalResult
+}
+
+/**
+ * Aggregate multiple seed runs into a single result with mean ± std.
+ */
+export function aggregateSeeds(
+  results: EvoSkillBenchmarkResult[]
+): EvoSkillBenchmarkResult {
+  if (results.length === 0) {
+    throw new Error('Cannot aggregate 0 results')
+  }
+
+  if (results.length === 1) {
+    // Single seed: accuracyStd stays undefined
+    return { ...results[0] }
+  }
+
+  const accuracies = results.map((r) => r.accuracy)
+  const meanAccuracy = accuracies.reduce((a, b) => a + b, 0) / accuracies.length
+  const variance =
+    accuracies.reduce((sum, a) => sum + (a - meanAccuracy) ** 2, 0) / (accuracies.length - 1)
+  const std = Math.sqrt(variance)
+
+  const totalCostTokens = results.reduce((s, r) => s + r.costTokens, 0)
+  const totalCostDollars = results.reduce((s, r) => s + r.costDollars, 0)
+  const totalWallClock = results.reduce((s, r) => s + r.wallClockMs, 0)
+  const totalTasks = results.reduce((s, r) => s + r.taskCount, 0)
+  const totalCorrect = results.reduce((s, r) => s + r.correctCount, 0)
+
+  // Average IR metrics across seeds if present
+  let irMetrics: EvoSkillBenchmarkResult['irMetrics']
+  const withIr = results.filter((r) => r.irMetrics)
+  if (withIr.length > 0) {
+    irMetrics = {
+      ndcg5: withIr.reduce((s, r) => s + r.irMetrics!.ndcg5, 0) / withIr.length,
+      mrr: withIr.reduce((s, r) => s + r.irMetrics!.mrr, 0) / withIr.length,
+      map5: withIr.reduce((s, r) => s + r.irMetrics!.map5, 0) / withIr.length,
+    }
+  }
+
+  return {
+    condition: results[0].condition,
+    benchmark: results[0].benchmark,
+    split: results[0].split,
+    accuracy: meanAccuracy,
+    accuracyStd: std,
+    taskCount: totalTasks,
+    correctCount: totalCorrect,
+    costTokens: totalCostTokens,
+    costDollars: totalCostDollars,
+    wallClockMs: totalWallClock,
+    irMetrics,
+  }
+}
diff --git a/packages/core/src/benchmarks/evoskill/harness.ts b/packages/core/src/benchmarks/evoskill/harness.ts
new file mode 100644
index 00000000..4ec6b8ae
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/harness.ts
@@ -0,0 +1,182 @@
+// EvoSkill benchmark harness orchestrator — SMI-3273
+// Coordinates dataset loading, skill selection, agent execution, and evaluation
+// Parallelism: conditions concurrent per seed; seeds serial
+
+import type { BenchmarkTask, ConditionConfig, EvoSkillBenchmarkResult, HarnessConfig } from './types.js'
+import type { AgentClient, TaskResult } from './agent-runner.js'
+import type { ScorerFn } from './types.js'
+import * as pathModule from 'path'
+import { loadDataset } from './dataset-loader.js'
+import { runEvoSkillBatch } from './agent-runner.js'
+import { evaluate, aggregateSeeds } from './evaluator.js'
+
+/** Progress callback for harness execution */
+export type HarnessProgressFn = (event: HarnessProgressEvent) => void
+
+export interface HarnessProgressEvent {
+  type: 'seed_start' | 'condition_start' | 'condition_complete' | 'seed_complete' | 'harness_complete'
+  seed?: number
+  condition?: string
+  benchmark?: string
+  result?: EvoSkillBenchmarkResult
+  progress?: { completed: number; total: number }
+}
+
+/** Dependencies injected from CLI layer */
+export interface HarnessDependencies {
+  agentClient: AgentClient
+  /** Scorer per benchmark — each benchmark may need a different scorer */
+  getScorer: (benchmark: 'officeqa' | 'sealqa' | 'browsecomp') => ScorerFn
+  /** Read file content from path */
+  readFile: (path: string) => Promise<string>
+}
+
+export interface HarnessResult {
+  results: EvoSkillBenchmarkResult[]
+  aggregated: EvoSkillBenchmarkResult[]
+  wallClockMs: number
+}
+
+/**
+ * Run the full benchmark harness.
+ * Seeds run serially; conditions within each seed run concurrently.
+ */
+export async function runHarness(
+  config: HarnessConfig,
+  deps: HarnessDependencies,
+  onProgress?: HarnessProgressFn
+): Promise<HarnessResult> {
+  const harnessStart = Date.now()
+  const allResults: EvoSkillBenchmarkResult[] = []
+
+  for (const benchmark of config.benchmarks) {
+    // Load raw dataset content once per benchmark
+    const datasetPath = pathModule.join(config.datasetDir, getDatasetPath(benchmark))
+    const datasetContent = await deps.readFile(datasetPath)
+
+    for (const seed of config.seeds) {
+      onProgress?.({ type: 'seed_start', seed, benchmark })
+
+      // Re-split dataset with this seed for different train/val/test shuffle
+      const dataset = loadDataset(datasetContent, benchmark, { seed })
+
+      // Use test split (or sample fraction thereof)
+      let testTasks = dataset.test
+      if (config.sampleFraction < 1) {
+        const sampleSize = Math.max(1, Math.round(testTasks.length * config.sampleFraction))
+        testTasks = testTasks.slice(0, sampleSize)
+      }
+
+      // Run conditions concurrently within this seed
+      const conditionPromises = config.conditions.map(async (condition) => {
+        onProgress?.({ type: 'condition_start', seed, condition: condition.name, benchmark })
+
+        if (config.dryRun) {
+          return createDryRunResult(condition, benchmark, testTasks.length)
+        }
+
+        return runCondition(condition, benchmark, testTasks, seed, deps)
+      })
+
+      const seedResults = await Promise.all(conditionPromises)
+
+      for (const result of seedResults) {
+        allResults.push(result)
+        onProgress?.({
+          type: 'condition_complete',
+          seed,
+          condition: result.condition,
+          benchmark,
+          result,
+        })
+      }
+
+      onProgress?.({ type: 'seed_complete', seed, benchmark })
+    }
+  }
+
+  // Aggregate across seeds per (condition, benchmark) pair
+  const aggregated = aggregateResults(allResults)
+
+  onProgress?.({ type: 'harness_complete' })
+
+  return {
+    results: allResults,
+    aggregated,
+    wallClockMs: Date.now() - harnessStart,
+  }
+}
+
+/** Run a single condition on a benchmark's test tasks */
+async function runCondition(
+  condition: ConditionConfig,
+  benchmark: string,
+  testTasks: BenchmarkTask[],
+  seed: number,
+  deps: HarnessDependencies
+): Promise<EvoSkillBenchmarkResult> {
+  // Select skills
+  const skills = await condition.skillSelector(testTasks)
+
+  // Run tasks through agent
+  const taskResults: TaskResult[] = await runEvoSkillBatch(testTasks, {
+    client: deps.agentClient,
+    modelId: condition.modelId,
+    skills,
+  })
+
+  // Evaluate with benchmark-specific scorer
+  const scorer = deps.getScorer(benchmark as 'officeqa' | 'sealqa' | 'browsecomp')
+  return evaluate(testTasks, taskResults, {
+    scorer,
+    condition: condition.name,
+    benchmark,
+    split: 'test',
+    modelId: condition.modelId,
+  })
+}
+
+/** Create a placeholder result for dry-run mode */
+function createDryRunResult(
+  condition: ConditionConfig,
+  benchmark: string,
+  taskCount: number
+): EvoSkillBenchmarkResult {
+  return {
+    condition: condition.name,
+    benchmark,
+    split: 'test',
+    accuracy: 0,
+    taskCount,
+    correctCount: 0,
+    costTokens: 0,
+    costDollars: 0,
+    wallClockMs: 0,
+  }
+}
+
+/** Aggregate results across seeds for each (condition, benchmark) pair */
+function aggregateResults(results: EvoSkillBenchmarkResult[]): EvoSkillBenchmarkResult[] {
+  const groups = new Map<string, EvoSkillBenchmarkResult[]>()
+
+  for (const r of results) {
+    const key = `${r.condition}:${r.benchmark}`
+    const group = groups.get(key) ?? []
+    group.push(r)
+    groups.set(key, group)
+  }
+
+  return [...groups.values()].map(aggregateSeeds)
+}
+
+/** Dataset file paths (relative to data directory) */
+function getDatasetPath(benchmark: 'officeqa' | 'sealqa' | 'browsecomp'): string {
+  switch (benchmark) {
+    case 'officeqa':
+      return 'datasets/dabstep/dabstep.csv'
+    case 'sealqa':
+      return 'datasets/sealqa/sealqa.csv'
+    case 'browsecomp':
+      return 'datasets/browsecomp/browsecomp.json'
+  }
+}
diff --git a/packages/core/src/benchmarks/evoskill/index.ts b/packages/core/src/benchmarks/evoskill/index.ts
new file mode 100644
index 00000000..67b10095
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/index.ts
@@ -0,0 +1,84 @@
+// EvoSkill benchmark module barrel export
+
+export { ndcg, mrr, mapAtK, precisionAtK, recallAtK } from './ir-metrics.js'
+
+export {
+  exactMatchScorer,
+  createLlmJudgeScorer,
+  getScorerForBenchmark,
+  type LlmJudgeClient,
+} from './scorers.js'
+
+export type {
+  BenchmarkTask,
+  ConditionConfig,
+  EvoSkillBenchmarkResult,
+  ScorerFn,
+  HarnessConfig,
+} from './types.js'
+
+export { EVOSKILL_DEFAULTS } from './types.js'
+
+// Dataset loader
+export {
+  loadDataset,
+  loadCSVDataset,
+  loadJSONDataset,
+  type DatasetLoadResult,
+} from './dataset-loader.js'
+
+// Skill selector
+export {
+  createBaselineSelector,
+  createEvoSkillEvolvedSelector,
+  createSearchSelector,
+  createRecommendSelector,
+  createOptimizedSelector,
+  createSkillCreateSelector,
+  createIterativeSelector,
+  createHybridSelector,
+  createCuratedSelector,
+  NotImplementedError,
+  CONDITIONS,
+  type ConditionNumber,
+  type ConditionName,
+  type SkillSelectorFn,
+  type SkillsmithSearchClient,
+  type SkillsmithRecommendClient,
+  type TransformationService,
+  type SkillCreateRunner,
+} from './skill-selector.js'
+
+// Agent runner
+export {
+  runEvoSkillTask,
+  runEvoSkillBatch,
+  calculateCost,
+  type AgentClient,
+  type AgentRunnerConfig,
+  type TaskResult,
+  type TaskTokenUsage,
+} from './agent-runner.js'
+
+// Evaluator
+export {
+  evaluate,
+  aggregateSeeds,
+  type EvaluatorConfig,
+} from './evaluator.js'
+
+// Harness orchestrator
+export {
+  runHarness,
+  type HarnessDependencies,
+  type HarnessResult,
+  type HarnessProgressFn,
+  type HarnessProgressEvent,
+} from './harness.js'
+
+// Report generator
+export {
+  generateMarkdownReport,
+  generateJsonReport,
+  type ReportOptions,
+} from './report.js'
diff --git a/packages/core/src/benchmarks/evoskill/ir-metrics.ts b/packages/core/src/benchmarks/evoskill/ir-metrics.ts
new file mode 100644
index 00000000..8c0ab360
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/ir-metrics.ts
@@ -0,0 +1,112 @@
+// IR metrics for EvoSkill benchmark evaluation
+// Implements nDCG, MRR, MAP, Precision@k, and Recall@k
+
+/**
+ * Discounted Cumulative Gain at position k.
+ * Uses the standard log2(i+1) discount factor.
+ */
+function dcgAtK(ranked: string[], relevance: Map<string, number>, k: number): number {
+  let dcg = 0
+  const limit = Math.min(k, ranked.length)
+  for (let i = 0; i < limit; i++) {
+    const rel = relevance.get(ranked[i]) ?? 0
+    dcg += rel / Math.log2(i + 2) // i+2 because log2(1) = 0
+  }
+  return dcg
+}
+
+/**
+ * Normalized Discounted Cumulative Gain at position k.
+ * Measures ranking quality with graded relevance.
+ * Returns 0 if no relevant items exist.
+ */
+export function ndcg(ranked: string[], relevance: Map<string, number>, k: number): number {
+  if (ranked.length === 0 || relevance.size === 0) return 0
+
+  const actual = dcgAtK(ranked, relevance, k)
+
+  // Ideal ranking: sort all items by relevance descending
+  const idealRanked = [...relevance.entries()]
+    .sort((a, b) => b[1] - a[1])
+    .map(([id]) => id)
+
+  const ideal = dcgAtK(idealRanked, relevance, k)
+  if (ideal === 0) return 0
+
+  return actual / ideal
+}
+
+/**
+ * Mean Reciprocal Rank.
+ * Returns 1/rank of the first relevant result, or 0 if none found.
+ */
+export function mrr(ranked: string[], relevant: Set<string>): number {
+  if (ranked.length === 0 || relevant.size === 0) return 0
+
+  for (let i = 0; i < ranked.length; i++) {
+    if (relevant.has(ranked[i])) {
+      return 1 / (i + 1)
+    }
+  }
+  return 0
+}
+
+/**
+ * Mean Average Precision at k.
+ * Computes average precision over positions up to k.
+ */
+export function mapAtK(ranked: string[], relevant: Set<string>, k: number): number {
+  if (ranked.length === 0 || relevant.size === 0) return 0
+
+  let hits = 0
+  let sumPrecision = 0
+  const limit = Math.min(k, ranked.length)
+
+  for (let i = 0; i < limit; i++) {
+    if (relevant.has(ranked[i])) {
+      hits++
+      sumPrecision += hits / (i + 1)
+    }
+  }
+
+  // Normalize by total relevant items (not k)
+  return hits > 0 ? sumPrecision / relevant.size : 0
+}
+
+/**
+ * Precision at k.
+ * Fraction of top-k results that are relevant.
+ */
+export function precisionAtK(ranked: string[], relevant: Set<string>, k: number): number {
+  if (ranked.length === 0 || relevant.size === 0) return 0
+
+  const limit = Math.min(k, ranked.length)
+  let hits = 0
+
+  for (let i = 0; i < limit; i++) {
+    if (relevant.has(ranked[i])) {
+      hits++
+    }
+  }
+
+  return hits / limit
+}
+
+/**
+ * Recall at k.
+ * Fraction of relevant items found in top-k results.
+ */
+export function recallAtK(ranked: string[], relevant: Set<string>, k: number): number {
+  if (ranked.length === 0 || relevant.size === 0) return 0
+
+  const limit = Math.min(k, ranked.length)
+  let hits = 0
+
+  for (let i = 0; i < limit; i++) {
+    if (relevant.has(ranked[i])) {
+      hits++
+    }
+  }
+
+  return hits / relevant.size
+}
diff --git a/packages/core/src/benchmarks/evoskill/report.ts b/packages/core/src/benchmarks/evoskill/report.ts
new file mode 100644
index 00000000..4fc7e3e8
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/report.ts
@@ -0,0 +1,179 @@
+// EvoSkill report generator — SMI-3274
+// Self-specified schema: markdown tables + JSON export
+
+import type { EvoSkillBenchmarkResult } from './types.js'
+import type { HarnessResult } from './harness.js'
+
+export interface ReportOptions {
+  title?: string
+  includeRawResults?: boolean
+  includePareto?: boolean
+}
+
+/**
+ * Generate markdown comparison table.
+ * Columns: Condition | OfficeQA Acc | SEAL-QA Acc | BrowseComp Acc | Cost ($) | Time (s)
+ */
+export function generateMarkdownReport(
+  harnessResult: HarnessResult,
+  options: ReportOptions = {}
+): string {
+  const { title = 'EvoSkill Benchmark Results', includePareto = true } = options
+  const { aggregated, wallClockMs } = harnessResult
+
+  const lines: string[] = []
+  lines.push(`# ${title}`)
+  lines.push('')
+  lines.push(`Total wall clock: ${(wallClockMs / 1000).toFixed(1)}s`)
+  lines.push('')
+
+  // Main comparison table
+  lines.push('## Comparison Table')
+  lines.push('')
+  lines.push('| Condition | OfficeQA Acc | SEAL-QA Acc | BrowseComp Acc | Cost ($) | Time (s) |')
+  lines.push('|-----------|-------------|------------|----------------|----------|----------|')
+
+  const conditions = [...new Set(aggregated.map((r) => r.condition))]
+  const benchmarks: Array<'officeqa' | 'sealqa' | 'browsecomp'> = ['officeqa', 'sealqa', 'browsecomp']
+
+  for (const cond of conditions) {
+    const cells = [cond]
+
+    let totalCost = 0
+    let totalTime = 0
+
+    for (const bm of benchmarks) {
+      const result = aggregated.find((r) => r.condition === cond && r.benchmark === bm)
+      if (result) {
+        cells.push(formatAccuracy(result.accuracy, result.accuracyStd))
+        totalCost += result.costDollars
+        totalTime += result.wallClockMs / 1000
+      } else {
+        cells.push('—')
+      }
+    }
+
+    cells.push(`$${totalCost.toFixed(2)}`)
+    cells.push(totalTime.toFixed(1))
+    lines.push(`| ${cells.join(' | ')} |`)
+  }
+
+  lines.push('')
+
+  // Pareto frontier
+  if (includePareto) {
+    lines.push('## Pareto Frontier (Accuracy vs Cost)')
+    lines.push('')
+    const paretoPoints = computeParetoFrontier(aggregated)
+    if (paretoPoints.length > 0) {
+      lines.push('| Condition | Benchmark | Accuracy | Cost ($) | Pareto-Optimal |')
+      lines.push('|-----------|-----------|----------|----------|----------------|')
+      for (const p of paretoPoints) {
+        const isOptimal = p.isPareto ? 'Yes' : ''
+        lines.push(
+          `| ${p.condition} | ${p.benchmark} | ${(p.accuracy * 100).toFixed(1)}% | $${p.cost.toFixed(2)} | ${isOptimal} |`
+        )
+      }
+      lines.push('')
+    }
+  }
+
+  // IR metrics table (if any results have them)
+  const withIr = aggregated.filter((r) => r.irMetrics)
+  if (withIr.length > 0) {
+    lines.push('## IR Metrics (Retrieval Conditions)')
+    lines.push('')
+    lines.push('| Condition | Benchmark | nDCG@5 | MRR | MAP@5 |')
+    lines.push('|-----------|-----------|--------|-----|-------|')
+    for (const r of withIr) {
+      const ir = r.irMetrics!
+      lines.push(
+        `| ${r.condition} | ${r.benchmark} | ${ir.ndcg5.toFixed(3)} | ${ir.mrr.toFixed(3)} | ${ir.map5.toFixed(3)} |`
+      )
+    }
+    lines.push('')
+  }
+
+  return lines.join('\n')
+}
+
+/** Generate JSON report */
+export function generateJsonReport(harnessResult: HarnessResult): string {
+  const output = {
+    generatedAt: new Date().toISOString(),
+    wallClockMs: harnessResult.wallClockMs,
+    aggregated: harnessResult.aggregated.map(serializeResult),
+    results: harnessResult.results.map(serializeResult),
+    paretoFrontier: computeParetoFrontier(harnessResult.aggregated)
+      .filter((p) => p.isPareto)
+      .map((p) => ({ condition: p.condition, benchmark: p.benchmark, accuracy: p.accuracy, cost: p.cost })),
+  }
+  return JSON.stringify(output, null, 2)
+}
+
+/** Format accuracy as percentage with optional std */
+function formatAccuracy(accuracy: number, std?: number): string {
+  const pct = (accuracy * 100).toFixed(1)
+  if (std === undefined) return `${pct}%`
+  return `${pct} ± ${(std * 100).toFixed(1)}%`
+}
+
+/** Serialize result for JSON (omit undefined fields) */
+function serializeResult(r: EvoSkillBenchmarkResult): Record<string, unknown> {
+  const obj: Record<string, unknown> = {
+    condition: r.condition,
+    benchmark: r.benchmark,
+    split: r.split,
+    accuracy: r.accuracy,
+    taskCount: r.taskCount,
+    correctCount: r.correctCount,
+    costTokens: r.costTokens,
+    costDollars: r.costDollars,
+    wallClockMs: r.wallClockMs,
+  }
+  if (r.accuracyStd !== undefined) obj.accuracyStd = r.accuracyStd
+  if (r.irMetrics) obj.irMetrics = r.irMetrics
+  return obj
+}
+
+interface ParetoPoint {
+  condition: string
+  benchmark: string
+  accuracy: number
+  cost: number
+  isPareto: boolean
+}
+
+/** Compute Pareto frontier: no other point dominates on both accuracy AND cost */
+function computeParetoFrontier(results: EvoSkillBenchmarkResult[]): ParetoPoint[] {
+  const points: ParetoPoint[] = results.map((r) => ({
+    condition: r.condition,
+    benchmark: r.benchmark,
+    accuracy: r.accuracy,
+    cost: r.costDollars,
+    isPareto: false,
+  }))
+
+  // Group by benchmark for per-benchmark Pareto
+  const byBenchmark = new Map<string, ParetoPoint[]>()
+  for (const p of points) {
+    const group = byBenchmark.get(p.benchmark) ?? []
+    group.push(p)
+    byBenchmark.set(p.benchmark, group)
+  }
+
+  for (const group of byBenchmark.values()) {
+    for (const p of group) {
+      // A point is Pareto-optimal if no other point has >= accuracy AND <= cost
+      p.isPareto = !group.some(
+        (other) =>
+          other !== p &&
+          other.accuracy >= p.accuracy &&
+          other.cost <= p.cost &&
+          (other.accuracy > p.accuracy || other.cost < p.cost)
+      )
+    }
+  }
+
+  return points
+}
diff --git a/packages/core/src/benchmarks/evoskill/scorers.ts b/packages/core/src/benchmarks/evoskill/scorers.ts
new file mode 100644
index 00000000..90f3e87f
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/scorers.ts
@@ -0,0 +1,141 @@
+// Scorer implementations for EvoSkill benchmarks
+// Multi-tolerance exact-match (OfficeQA/DABStep) and LLM-judge (SEAL-QA)
+
+import type { ScorerFn } from './types.js'
+
+/** LLM judge client interface — injected to avoid SDK dependency in core */
+export interface LlmJudgeClient {
+  judge(params: {
+    model: string
+    question: string
+    predicted: string
+    groundTruth: string
+  }): Promise<number>
+}
+
+/**
+ * Normalize a string for comparison:
+ * - lowercase
+ * - strip leading/trailing whitespace
+ * - remove trailing punctuation (., !, ?)
+ */
+function normalize(s: string): string {
+  return s
+    .trim()
+    .toLowerCase()
+    .replace(/^["']+|["']+$/g, '') // strip surrounding quotes
+    .trim()
+    .replace(/[.!?]+$/, '')
+}
+
+/**
+ * Check if two numeric strings are within tolerance.
+ * Returns true if both parse as numbers and |a - b| <= tolerance.
+ */
+function numericMatch(a: string, b: string, tolerance = 0.01): boolean {
+  const numA = parseFloat(a)
+  const numB = parseFloat(b)
+  if (isNaN(numA) || isNaN(numB)) return false
+  return Math.abs(numA - numB) <= tolerance
+}
+
+/**
+ * Generate variations of a string for matching:
+ * - Original normalized
+ * - Without units (strip trailing alphabetic suffix)
+ * - Without commas (e.g., "1,000" → "1000")
+ * - Without percentage sign
+ */
+function variations(s: string): string[] {
+  const norm = normalize(s)
+  const result = [norm]
+
+  // Without trailing units (e.g., "42 kg" → "42")
+  const withoutUnits = norm.replace(/\s+[a-z%]+$/, '')
+  if (withoutUnits !== norm) result.push(withoutUnits)
+
+  // Without commas
+  const withoutCommas = norm.replace(/,/g, '')
+  if (withoutCommas !== norm) result.push(withoutCommas)
+
+  // Without percentage
+  const withoutPercent = norm.replace(/%$/, '')
+  if (withoutPercent !== norm) result.push(withoutPercent)
+
+  return result
+}
+
+/**
+ * Multi-tolerance exact-match scorer for OfficeQA/DABStep.
+ * Handles:
+ * - Case-insensitive comparison
+ * - Trailing punctuation removal
+ * - With/without units
+ * - Numeric tolerance (±0.01)
+ * - Comma-separated alternatives in ground truth
+ *
+ * Returns 1.0 if any variation matches, 0.0 otherwise.
+ */
+export const exactMatchScorer: ScorerFn = (_question, predicted, groundTruth) => {
+  const predVariations = variations(predicted)
+
+  // Ground truth may contain comma-space-separated alternatives
+  // Use ', ' (not bare ',') to avoid splitting numbers like '1,000'
+  const truthAlternatives = groundTruth.split(', ').map((s) => s.trim())
+
+  for (const truth of truthAlternatives) {
+    const truthVariations = variations(truth)
+
+    // Check exact match between any variation pair
+    for (const pv of predVariations) {
+      for (const tv of truthVariations) {
+        if (pv === tv) return 1.0
+      }
+    }
+
+    // Check numeric match
+    for (const pv of predVariations) {
+      for (const tv of truthVariations) {
+        if (numericMatch(pv, tv)) return 1.0
+      }
+    }
+  }
+
+  return 0.0
+}
+
+/**
+ * LLM-judge scorer for SEAL-QA.
+ * Accepts an injected LlmJudgeClient to avoid @anthropic-ai/sdk dependency in core.
+ * The CLI package provides the concrete implementation.
+ *
+ * Judge model is pinned via JUDGE_MODEL_ID constant — never the agent model.
+ * Returns a score 0.0–1.0.
+ */
+export function createLlmJudgeScorer(client: LlmJudgeClient, judgeModelId: string): ScorerFn {
+  return async (question: string, predicted: string, groundTruth: string) => {
+    const score = await client.judge({ model: judgeModelId, question, predicted, groundTruth })
+    return Math.max(0, Math.min(1, score))
+  }
+}
+
+/**
+ * Get the appropriate scorer for a benchmark.
+ * For LLM-judged benchmarks, requires an injected LlmJudgeClient.
+ */
+export function getScorerForBenchmark(
+  benchmark: 'officeqa' | 'sealqa' | 'browsecomp',
+  judgeModelId: string,
+  llmClient?: LlmJudgeClient
+): ScorerFn {
+  switch (benchmark) {
+    case 'officeqa':
+      return exactMatchScorer
+    case 'sealqa':
+    case 'browsecomp':
+      if (!llmClient) {
+        throw new Error(`LLM judge client required for ${benchmark} benchmark`)
+      }
+      return createLlmJudgeScorer(llmClient, judgeModelId)
+  }
+}
diff --git a/packages/core/src/benchmarks/evoskill/skill-selector.ts b/packages/core/src/benchmarks/evoskill/skill-selector.ts
new file mode 100644
index 00000000..d24909c0
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/skill-selector.ts
@@ -0,0 +1,181 @@
+// EvoSkill skill selector — SMI-3270
+// Conditions 1–6, 8–9; condition 7 throws NotImplementedError (Study B)
+
+import type { BenchmarkTask } from './types.js'
+
+/** Skill selector: given tasks, returns skill content strings to inject */
+export type SkillSelectorFn = (tasks: BenchmarkTask[]) => Promise<string[]>
+
+/** Dependency interfaces for conditions 5-6 (injected from CLI layer) */
+export interface TransformationService {
+  optimize(skillContent: string, tasks: BenchmarkTask[]): Promise<string>
+}
+
+export interface SkillCreateRunner {
+  create(description: string): Promise<string>
+}
+
+export interface SkillsmithSearchClient {
+  search(query: string, limit?: number): Promise<Array<{ content: string; score: number }>>
+}
+
+export interface SkillsmithRecommendClient {
+  recommend(context: string, limit?: number): Promise<Array<{ content: string; score: number }>>
+}
+
+/** Condition 1: Baseline — empty skill set */
+export function createBaselineSelector(): SkillSelectorFn {
+  return async () => []
+}
+
+/** Condition 2: EvoSkill-Evolved — load pre-evolved skill from file */
+export function createEvoSkillEvolvedSelector(evolvedSkillPath: string): SkillSelectorFn {
+  // Validate path at construction time — no traversal allowed
+  if (evolvedSkillPath.includes('..')) {
+    throw new Error(`Evolved skill path must not contain '..': ${evolvedSkillPath}`)
+  }
+  return async () => {
+    const fs = await import('fs/promises')
+    const content = await fs.readFile(evolvedSkillPath, 'utf-8')
+    return [content]
+  }
+}
+
+/** Condition 3: Skillsmith-Search — best skill from registry search */
+export function createSearchSelector(client: SkillsmithSearchClient): SkillSelectorFn {
+  return async (tasks: BenchmarkTask[]) => {
+    // Derive query from task benchmark + representative questions
+    const benchmark = tasks[0]?.benchmark ?? 'general'
+    const sampleQuestions = tasks
+      .slice(0, 3)
+      .map((t) => t.question)
+      .join('; ')
+    const query = `${benchmark} benchmark: ${sampleQuestions}`
+
+    const results = await client.search(query, 5)
+    if (results.length === 0) return []
+    return [results[0].content]
+  }
+}
+
+/** Condition 4: Skillsmith-Recommend — best skill from recommendations */
+export function createRecommendSelector(client: SkillsmithRecommendClient): SkillSelectorFn {
+  return async (tasks: BenchmarkTask[]) => {
+    const benchmark = tasks[0]?.benchmark ?? 'general'
+    const context = `Solving ${benchmark} benchmark tasks requiring data analysis and reasoning`
+
+    const results = await client.recommend(context, 5)
+    if (results.length === 0) return []
+    return [results[0].content]
+  }
+}
+
+/** Condition 5: Skillsmith-Optimized — search + optimize with TransformationService */
+export function createOptimizedSelector(
+  searchClient: SkillsmithSearchClient,
+  transformService: TransformationService
+): SkillSelectorFn {
+  return async (tasks: BenchmarkTask[]) => {
+    const searchSelector = createSearchSelector(searchClient)
+    const skills = await searchSelector(tasks)
+    if (skills.length === 0) return []
+
+    const optimized = await transformService.optimize(skills[0], tasks)
+    return [optimized]
+  }
+}
+
+/** Condition 6: Skillsmith-Create — generate skill via CLI runner */
+export function createSkillCreateSelector(runner: SkillCreateRunner): SkillSelectorFn {
+  return async (tasks: BenchmarkTask[]) => {
+    const benchmark = tasks[0]?.benchmark ?? 'general'
+    const sampleQuestions = tasks
+      .slice(0, 5)
+      .map((t) => t.question)
+      .join('\n')
+    const description = `A skill for solving ${benchmark} benchmark tasks. Example tasks:\n${sampleQuestions}`
+
+    const content = await runner.create(description)
+    return [content]
+  }
+}
+
+/** Condition 7: Skillsmith-Iterative — uses IterativeEvaluator from Study B */
+export function createIterativeSelector(params: {
+  iterativeEvaluator: IterativeEvaluatorInstance
+  baselineSkillContent: string
+  skillId: string
+  trainTasks: BenchmarkTask[]
+  valTasks: BenchmarkTask[]
+}): SkillSelectorFn {
+  return async () => {
+    const result = await params.iterativeEvaluator.run(
+      params.baselineSkillContent,
+      params.skillId,
+      params.trainTasks.map((t) => ({ id: t.id, question: t.question, groundTruth: t.groundTruth })),
+      params.valTasks.map((t) => ({ id: t.id, question: t.question, groundTruth: t.groundTruth })),
+      [] // test tasks handled by harness, not the selector
+    )
+    if (result.finalFrontier.length === 0) return []
+    // Return best frontier variant's skill content
+    const best = result.finalFrontier.reduce((a, b) => (a.accuracy >= b.accuracy ? a : b))
+    return [best.variant.content]
+  }
+}
+
+/** IterativeEvaluator interface to avoid circular imports */
+interface IterativeEvaluatorInstance {
+  run(
+    baselineContent: string,
+    skillId: string,
+    trainTasks: Array<{ id: string; question: string; groundTruth: string }>,
+    valTasks: Array<{ id: string; question: string; groundTruth: string }>,
+    testTasks: Array<{ id: string; question: string; groundTruth: string }>
+  ): Promise<{
+    finalFrontier: Array<{ variant: { content: string }; accuracy: number }>
+  }>
+}
+
+/** Condition 8: Hybrid — Skillsmith search → EvoSkill evolution */
+export function createHybridSelector(
+  searchClient: SkillsmithSearchClient,
+  evolveSkill: (baseSkill: string, tasks: BenchmarkTask[]) => Promise<string>
+): SkillSelectorFn {
+  return async (tasks: BenchmarkTask[]) => {
+    const searchSelector = createSearchSelector(searchClient)
+    const skills = await searchSelector(tasks)
+    if (skills.length === 0) return []
+
+    const evolved = await evolveSkill(skills[0], tasks)
+    return [evolved]
+  }
+}
+
+/** Condition 9: Skillsmith-Curated — hand-picked skill IDs */
+export function createCuratedSelector(skillContents: string[]): SkillSelectorFn {
+  return async () => skillContents
+}
+
+/** Error for unimplemented conditions */
+export class NotImplementedError extends Error {
+  constructor(message: string) {
+    super(message)
+    this.name = 'NotImplementedError'
+  }
+}
+
+/** Registry of all condition factories */
+export const CONDITIONS = {
+  1: 'baseline',
+  2: 'evoskill-evolved',
+  3: 'skillsmith-search',
+  4: 'skillsmith-recommend',
+  5: 'skillsmith-optimized',
+  6: 'skillsmith-create',
+  7: 'skillsmith-iterative',
+  8: 'hybrid',
+  9: 'skillsmith-curated',
+} as const
+
+export type ConditionNumber = keyof typeof CONDITIONS
+export type ConditionName = (typeof CONDITIONS)[ConditionNumber]
diff --git a/packages/core/src/benchmarks/evoskill/types.ts b/packages/core/src/benchmarks/evoskill/types.ts
new file mode 100644
index 00000000..1f389d76
--- /dev/null
+++ b/packages/core/src/benchmarks/evoskill/types.ts
@@ -0,0 +1,88 @@
+// EvoSkill benchmark types
+// Named EvoSkillBenchmarkResult to avoid collision with core BenchmarkResult
+
+export interface BenchmarkTask {
+  id: string
+  question: string
+  groundTruth: string
+  split: 'train' | 'val' | 'test'
+  benchmark: 'officeqa' | 'sealqa' | 'browsecomp'
+}
+
+/**
+ * ConditionConfig.skillSelector implementations for Conditions 5 (Skillsmith-Optimized)
+ * and 6 (Skillsmith-Create) require injected service instances (TransformationService,
+ * CLI runner). Do not implement these as pure functions — pass dependencies via a
+ * factory or closure over injected services before registering the selector.
+ */
+export interface ConditionConfig {
+  name: string
+  skillSelector: (tasks: BenchmarkTask[]) => Promise<string[]>
+  /** Model ID for the agent under test */
+  modelId: string
+  /** Controls dataset split shuffle; temperature stays 0 for determinism */
+  seed: number
+}
+
+/**
+ * Named EvoSkillBenchmarkResult to avoid collision with core BenchmarkResult type.
+ * Both are exported from @skillsmith/core; identical names would cause ambiguous imports.
+ */
+export interface EvoSkillBenchmarkResult {
+  condition: string
+  benchmark: string
+  split: string
+  accuracy: number
+  taskCount: number
+  correctCount: number
+  costTokens: number
+  costDollars: number
+  wallClockMs: number
+  /** Undefined for single-seed runs (Opus ablation); omit from JSON, render as "n/a" in markdown */
+  accuracyStd?: number
+  irMetrics?: {
+    ndcg5: number
+    mrr: number
+    map5: number
+  }
+}
+
+/** Scorer function signature: returns 0.0–1.0 */
+export type ScorerFn = (
+  question: string,
+  predicted: string,
+  groundTruth: string
+) => number | Promise<number>
+
+/** Configuration for the benchmark harness */
+export interface HarnessConfig {
+  benchmarks: Array<'officeqa' | 'sealqa' | 'browsecomp'>
+  conditions: ConditionConfig[]
+  seeds: number[]
+  /** Fraction of test set to use (0-1, default 1.0) */
+  sampleFraction: number
+  /** Base directory for dataset files (absolute path) */
+  datasetDir: string
+  /** Output directory for results */
+  outputDir: string
+  /** Dry run — validate config without executing API calls */
+  dryRun: boolean
+}
+
+/** Harness constants */
+export const EVOSKILL_DEFAULTS = {
+  /** EvoSkill's default seed for dataset splits */
+  SEED: 42,
+  /** Default split ratios matching EvoSkill */
+  TRAIN_RATIO: 0.18,
+  VAL_RATIO: 0.12,
+  TEST_RATIO: 0.7,
+  /** Judge model for LLM-scored benchmarks (always Sonnet, never the agent model) */
+  JUDGE_MODEL_ID: 'claude-sonnet-4-6',
+  /** Default agent model */
+  AGENT_MODEL_ID: 'claude-sonnet-4-6',
+  /** Retry delays for rate-limited API calls (ms) */
+  RETRY_DELAYS: [1000, 2000, 4000] as const,
+  /** Per-task timeout in ms */
+  TASK_TIMEOUT_MS: 120_000,
+} as const
diff --git a/packages/core/src/benchmarks/index.ts b/packages/core/src/benchmarks/index.ts
index 820eb423..1d089293 100644
--- a/packages/core/src/benchmarks/index.ts
+++ b/packages/core/src/benchmarks/index.ts
@@ -93,6 +93,75 @@ export {
   validateEmbeddingResults,
 } from './embeddingBenchmark.js'
 
+// EvoSkill benchmark evaluation
+export {
+  // IR metrics
+  ndcg,
+  mrr,
+  mapAtK,
+  precisionAtK,
+  recallAtK,
+  // Scorers
+  exactMatchScorer,
+  createLlmJudgeScorer,
+  getScorerForBenchmark,
+  // Constants
+  EVOSKILL_DEFAULTS,
+  CONDITIONS,
+  NotImplementedError,
+  // Dataset loader
+  loadDataset,
+  loadCSVDataset,
+  loadJSONDataset,
+  // Skill selectors
+  createBaselineSelector,
+  createEvoSkillEvolvedSelector,
+  createSearchSelector,
+  createRecommendSelector,
+  createOptimizedSelector,
+  createSkillCreateSelector,
+  createIterativeSelector,
+  createHybridSelector,
+  createCuratedSelector,
+  // Agent runner
+  runEvoSkillTask,
+  runEvoSkillBatch,
+  calculateCost,
+  // Evaluator
+  evaluate,
+  aggregateSeeds,
+  // Harness
+  runHarness,
+  // Report
+  generateMarkdownReport,
+  generateJsonReport,
+  // Types
+  type BenchmarkTask,
+  type ConditionConfig,
+  type EvoSkillBenchmarkResult,
+  type ScorerFn,
+  type HarnessConfig,
+  type LlmJudgeClient,
+  type DatasetLoadResult,
+  type SkillSelectorFn,
+  type SkillsmithSearchClient,
+  type SkillsmithRecommendClient,
+  type TransformationService,
+  type SkillCreateRunner,
+  type ConditionNumber,
+  type ConditionName,
+  type AgentClient,
+  type AgentRunnerConfig,
+  type TaskResult,
+  type TaskTokenUsage,
+  type EvaluatorConfig,
+  type HarnessDependencies,
+  type HarnessResult,
+  type HarnessProgressFn,
+  type HarnessProgressEvent,
+  type ReportOptions,
+} from './evoskill/index.js'
+
 // SMI-677: Shared statistical utilities
 export {
   percentile,
diff --git a/packages/core/src/db/migrations/v11-benchmark-evaluator.ts b/packages/core/src/db/migrations/v11-benchmark-evaluator.ts
new file mode 100644
index 00000000..35b9c953
--- /dev/null
+++ b/packages/core/src/db/migrations/v11-benchmark-evaluator.ts
@@ -0,0 +1,77 @@
+/**
+ * @fileoverview Migration v11 — EvoSkill benchmark evaluator tables
+ * @module @skillsmith/core/db/migrations/v11-benchmark-evaluator
+ * @see Plan: docs/internal/implementation/evoskill-task-accuracy-evaluator.md
+ *
+ * Adds three tables for Study B (Task-Accuracy Evaluator):
+ *  - benchmark_results: evaluation results across conditions/benchmarks/splits/seeds
+ *  - skill_variants: skill variants generated during iterative evaluation
+ *  - failure_patterns: categorized failure patterns per evaluation
+ *
+ * SCHEMA_VERSION reserved: 11 (Study B — evoskill-task-accuracy-evaluator branch)
+ */
+export const MIGRATION_V11_SQL = `
+CREATE TABLE IF NOT EXISTS benchmark_results (
+  id TEXT PRIMARY KEY,
+  skill_id TEXT NOT NULL,
+  skill_variant_hash TEXT NOT NULL,
+  benchmark TEXT NOT NULL CHECK (benchmark IN ('officeqa', 'sealqa', 'browsecomp')),
+  split TEXT NOT NULL CHECK (split IN ('train', 'val', 'test')),
+  condition TEXT NOT NULL,
+  iteration INTEGER DEFAULT 0,
+  accuracy REAL NOT NULL CHECK (accuracy >= 0 AND accuracy <= 1),
+  task_count INTEGER NOT NULL,
+  correct_count INTEGER NOT NULL CHECK (correct_count >= 0 AND correct_count <= task_count),
+  cost_tokens INTEGER,
+  cost_dollars REAL,
+  wall_clock_ms INTEGER,
+  scorer TEXT NOT NULL CHECK (scorer IN ('exact_match', 'llm_judge')),
+  model_id TEXT NOT NULL,
+  seed INTEGER NOT NULL,
+  created_at TEXT NOT NULL DEFAULT (datetime('now')),
+  FOREIGN KEY (skill_id) REFERENCES skills(id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_benchmark_results_skill
+  ON benchmark_results(skill_id, benchmark, split);
+CREATE INDEX IF NOT EXISTS idx_benchmark_results_condition
+  ON benchmark_results(condition, benchmark);
+
+CREATE TABLE IF NOT EXISTS skill_variants (
+  id TEXT PRIMARY KEY,
+  skill_id TEXT NOT NULL,
+  parent_variant_id TEXT,
+  content_hash TEXT NOT NULL,
+  iteration INTEGER NOT NULL,
+  generation_method TEXT NOT NULL CHECK (
+    generation_method IN ('baseline', 'decompose', 'augment', 'specialize', 'llm_rewrite')
+  ),
+  accuracy_train REAL,
+  accuracy_val REAL,
+  accuracy_test REAL,
+  content_lines INTEGER,
+  cost_tokens INTEGER,
+  is_frontier INTEGER DEFAULT 0 CHECK (is_frontier IN (0, 1)),
+  created_at TEXT NOT NULL DEFAULT (datetime('now')),
+  FOREIGN KEY (skill_id) REFERENCES skills(id),
+  FOREIGN KEY (parent_variant_id) REFERENCES skill_variants(id),
+  UNIQUE (skill_id, content_hash)
+);
+
+CREATE INDEX IF NOT EXISTS idx_skill_variants_frontier
+  ON skill_variants(skill_id, is_frontier)
+  WHERE is_frontier = 1;
+
+CREATE TABLE IF NOT EXISTS failure_patterns (
+  id TEXT PRIMARY KEY,
+  benchmark_result_id TEXT NOT NULL,
+  category TEXT NOT NULL CHECK (
+    category IN ('wrong_format', 'missing_context', 'reasoning_error', 'tool_misuse', 'hallucination')
+  ),
+  frequency INTEGER NOT NULL,
+  example_tasks TEXT,           -- JSON array of task IDs
+  suggested_fix TEXT,
+  created_at TEXT NOT NULL DEFAULT (datetime('now')),
+  FOREIGN KEY (benchmark_result_id) REFERENCES benchmark_results(id)
+);
+`
diff --git a/packages/core/src/db/schema.ts b/packages/core/src/db/schema.ts
index db1a0a86..21ae5af3 100644
--- a/packages/core/src/db/schema.ts
+++ b/packages/core/src/db/schema.ts
@@ -23,11 +23,12 @@ import { MIGRATION_V6_SQL } from './migrations/v6-advisories.js'
 import { MIGRATION_V7_SQL } from './migrations/v7-compatibility.js'
 import { MIGRATION_V8_SQL } from './migrations/v8-co-installs.js'
 import { MIGRATION_V10_SQL } from './migrations/v10-dependencies.js'
+import { MIGRATION_V11_SQL } from './migrations/v11-benchmark-evaluator.js'
 
 export type DatabaseType = Database
 
-// v10 reserved: skill-dependency-intelligence (SMI-3134)
-export const SCHEMA_VERSION = 10
+// v11 reserved: evoskill-task-accuracy-evaluator (Study B — SMI-3284)
+export const SCHEMA_VERSION = 11
 
 /**
  * SQL statements for creating the database schema
@@ -223,6 +224,11 @@ export const MIGRATIONS: Migration[] = [
     description: 'Skill dependency intelligence: skill_dependencies table',
     sql: MIGRATION_V10_SQL,
   },
+  {
+    version: 11,
+    description: 'SMI-3284: EvoSkill benchmark evaluator tables (Study B)',
+    sql: MIGRATION_V11_SQL,
+  },
 ]
 
 /**
diff --git a/packages/core/src/evaluation/FailureAnalyzer.ts b/packages/core/src/evaluation/FailureAnalyzer.ts
new file mode 100644
index 00000000..e3aa4580
--- /dev/null
+++ b/packages/core/src/evaluation/FailureAnalyzer.ts
@@ -0,0 +1,231 @@
+/**
+ * @fileoverview FailureAnalyzer — categorize task failures from evaluations
+ * @module @skillsmith/core/evaluation/FailureAnalyzer
+ * @see SMI-3293, SMI-3294: Heuristic + LLM failure categorization
+ *
+ * Categorizes failures into 5 categories:
+ *  - wrong_format: predicted type doesn't match ground truth type
+ *  - missing_context: agent output signals insufficient information
+ *  - tool_misuse: tool calls failed or no tools used when needed
+ *  - reasoning_error: right type, wrong value (fallback category)
+ *  - hallucination: high confidence + wrong answer (best-effort, least reliable)
+ *
+ * The hallucination category is a best-effort approximation using
+ * detection-by-absence (no hedging language). It will produce false positives.
+ * Do not use hallucination frequency alone to drive variant generation.
+ */
+
+import type {
+  FailureAnalyzerConfig,
+  FailureCategory,
+  FailurePattern,
+  TaskFailure,
+} from './types.js'
+
+/** Templates for suggested fixes per category */
+const SUGGESTED_FIX_TEMPLATES: Record<FailureCategory, string> = {
+  wrong_format:
+    "Add explicit output format instructions: 'Always respond with a single number, no units'",
+  missing_context: "Add context retrieval step: 'Before answering, search for relevant documents'",
+  tool_misuse: "Add tool usage guidance: 'Use the file search tool to find data before reasoning'",
+  reasoning_error:
+    "Add step-by-step reasoning instruction: 'Break the problem into steps before answering'",
+  hallucination: "Add confidence calibration: 'If uncertain, state your confidence level'",
+}
+
+/** Phrases signaling missing context in agent output */
+const MISSING_CONTEXT_PHRASES = [
+  "i don't have enough information",
+  'cannot determine',
+  'not provided',
+  'no information available',
+  'unable to find',
+  'insufficient data',
+  'not enough context',
+  "i'm not sure",
+  'no data available',
+]
+
+/** Hedging phrases that indicate uncertainty (absence → hallucination signal) */
+const HEDGING_PHRASES = [
+  "i'm not sure",
+  'i think',
+  'possibly',
+  'it might be',
+  'approximately',
+  'i believe',
+  'probably',
+  'perhaps',
+  'it seems',
+  'my best guess',
+  'uncertain',
+  'likely',
+  'not confident',
+]
+
+const DEFAULT_MAX_EXAMPLES = 5
+
+export class FailureAnalyzer {
+  private readonly mode: 'heuristic' | 'llm'
+  private readonly maxExamples: number
+
+  constructor(config?: Partial<FailureAnalyzerConfig>) {
+    this.mode = config?.mode ?? 'heuristic'
+    this.maxExamples = config?.maxExamplesPerCategory ?? DEFAULT_MAX_EXAMPLES
+  }
+
+  /**
+   * Analyze a set of task failures and categorize them.
+   * Returns patterns sorted by frequency descending.
+   */
+  analyze(failures: TaskFailure[]): FailurePattern[] {
+    if (failures.length === 0) return []
+
+    if (this.mode === 'llm') {
+      return this.analyzeLlm(failures)
+    }
+
+    return this.analyzeHeuristic(failures)
+  }
+
+  private analyzeHeuristic(failures: TaskFailure[]): FailurePattern[] {
+    const buckets = new Map<FailureCategory, TaskFailure[]>()
+
+    for (const failure of failures) {
+      const category = this.categorize(failure)
+      const list = buckets.get(category) ?? []
+      list.push(failure)
+      buckets.set(category, list)
+    }
+
+    const patterns: FailurePattern[] = []
+    for (const [category, examples] of buckets) {
+      patterns.push({
+        category,
+        frequency: examples.length,
+        examples: examples.slice(0, this.maxExamples),
+        suggestedFix: SUGGESTED_FIX_TEMPLATES[category],
+      })
+    }
+
+    // Sort by frequency descending
+    patterns.sort((a, b) => b.frequency - a.frequency)
+    return patterns
+  }
+
+  /**
+   * LLM mode stub — returns heuristic results with a flag.
+   * Full LLM implementation requires API client injection (Wave 1B optional).
+   */
+  private analyzeLlm(failures: TaskFailure[]): FailurePattern[] {
+    // LLM mode falls back to heuristic for now
+    // Future: send batches of 5 failures to Claude for nuanced categorization
+    return this.analyzeHeuristic(failures)
+  }
+
+  /**
+   * Categorize a single failure using heuristics.
+   * Order matters — earlier checks take priority.
+   */
+  private categorize(failure: TaskFailure): FailureCategory {
+    // 1. Wrong format: type mismatch between predicted and ground truth
+    if (this.isWrongFormat(failure)) {
+      return 'wrong_format'
+    }
+
+    // 2. Missing context: agent signals insufficient information
+    if (this.isMissingContext(failure)) {
+      return 'missing_context'
+    }
+
+    // 3. Tool misuse: tool call failed or no tools used when task needs them
+    if (this.isToolMisuse(failure)) {
+      return 'tool_misuse'
+    }
+
+    // 4. Hallucination: high confidence (no hedging) but wrong answer
+    // Best-effort — least reliable heuristic, detection-by-absence
+    if (this.isHallucination(failure)) {
+      return 'hallucination'
+    }
+
+    // 5. Reasoning error: fallback — right type, wrong value
+    return 'reasoning_error'
+  }
+
+  private isWrongFormat(failure: TaskFailure): boolean {
+    const predicted = failure.predicted.trim()
+    const truth = failure.groundTruth.trim()
+
+    // Check number vs non-number
+    const predIsNum = isNumericString(predicted)
+    const truthIsNum = isNumericString(truth)
+    if (predIsNum !== truthIsNum) return true
+
+    // Check list vs scalar (simple heuristic: comma-separated or newline-separated)
+    const predIsList = isListString(predicted)
+    const truthIsList = isListString(truth)
+    if (predIsList !== truthIsList) return true
+
+    // Check for drastically different length (10x ratio → likely format issue)
+    if (predicted.length > 0 && truth.length > 0) {
+      const ratio = predicted.length / truth.length
+      if (ratio > 10 || ratio < 0.1) return true
+    }
+
+    return false
+  }
+
+  private isMissingContext(failure: TaskFailure): boolean {
+    const output = failure.agentOutput.toLowerCase()
+    return MISSING_CONTEXT_PHRASES.some((phrase) => output.includes(phrase))
+  }
+
+  private isToolMisuse(failure: TaskFailure): boolean {
+    if (failure.toolCallFailed) return true
+
+    // If the task seems to need tools (ground truth references files/data)
+    // but agent used zero tool calls
+    if (failure.toolCallCount === 0) {
+      const output = failure.agentOutput.toLowerCase()
+      const needsTools =
+        output.includes('file') ||
+        output.includes('search') ||
+        output.includes('look up') ||
+        output.includes('database')
+      if (needsTools) return true
+    }
+
+    return false
+  }
+
+  private isHallucination(failure: TaskFailure): boolean {
+    const output = failure.agentOutput.toLowerCase()
+
+    // Must NOT contain hedging language (hallucination = confident + wrong)
+    const hasHedging = HEDGING_PHRASES.some((phrase) => output.includes(phrase))
+    if (hasHedging) return false
+
+    // Must have a substantive answer (not empty/very short)
+    if (output.trim().length < 10) return false
+
+    // Confident and wrong → hallucination signal
+    return true
+  }
+}
+
+/** Check if a string represents a numeric value */
+function isNumericString(s: string): boolean {
+  if (s.length === 0) return false
+  return !isNaN(Number(s.replace(/[,%$€£¥]/g, '').trim()))
+}
+
+/** Check if a string looks like a list (comma-separated or multi-line) */
+function isListString(s: string): boolean {
+  // Multiple comma-separated items
+  if (s.includes(',') && s.split(',').length >= 3) return true
+  // Multiple newline-separated items
+  const lines = s.split('\n').filter((l) => l.trim().length > 0)
+  if (lines.length >= 3) return true
+  return false
+}
diff --git a/packages/core/src/evaluation/IterativeEvaluator.ts b/packages/core/src/evaluation/IterativeEvaluator.ts
new file mode 100644
index 00000000..1758b1cd
--- /dev/null
+++ b/packages/core/src/evaluation/IterativeEvaluator.ts
@@ -0,0 +1,287 @@
+/**
+ * @fileoverview IterativeEvaluator — iterative skill refinement loop
+ * @module @skillsmith/core/evaluation/IterativeEvaluator
+ * @see SMI-3300: Main iteration loop (evaluate → analyze → generate → select)
+ * @see SMI-3301: Cost guard — stop when budget exhausted
+ *
+ * Pre-loop: evaluates baseline skill on val split to seed the frontier.
+ * Loop: train-split evaluation → failure analysis → variant generation →
+ *        val-split evaluation → Pareto selection → early stopping check.
+ * Post-loop: final evaluation on test split (never seen during iteration).
+ */
+
+import { createHash, randomUUID } from 'crypto'
+import { FailureAnalyzer } from './FailureAnalyzer.js'
+import { SkillVariantGenerator } from './SkillVariantGenerator.js'
+import type { RewriteClient } from './SkillVariantGenerator.js'
+import { VariantSelector } from './VariantSelector.js'
+import type {
+  GenerationMethod,
+  ScoredVariant,
+  SkillVariant,
+} from './types.js'
+import type { ScorerFn } from '../benchmarks/evoskill/types.js'
+
+/** Task structure for the evaluator */
+export interface EvalTask {
+  id: string
+  question: string
+  groundTruth: string
+}
+
+/** Agent runner — executes a task with a skill and returns the predicted answer */
+export interface AgentRunner {
+  run(params: { skillContent: string; question: string; modelId: string }): Promise<{
+    predicted: string
+    agentOutput: string
+    costTokens: number
+    toolCallFailed?: boolean
+    toolCallCount?: number
+  }>
+}
+
+/** Configuration for the iterative evaluation loop */
+export interface IterativeConfig {
+  maxIterations: number
+  frontierSize: number
+  generationStrategies: GenerationMethod[]
+  earlyStoppingPatience: number
+  costBudget: number
+  scorer: ScorerFn
+  agentRunner: AgentRunner
+  taskModelId: string
+  rewriteModelId: string
+  rewriteClient?: RewriteClient
+  benchmarkDomain: string
+  seed: number
+}
+
+/** Per-iteration snapshot for convergence tracking */
+export interface IterationSnapshot {
+  iteration: number
+  bestAccuracy: number
+  cost: number
+}
+
+/** Final result of the iterative evaluation */
+export interface IterativeResult {
+  finalFrontier: ScoredVariant[]
+  convergenceCurve: IterationSnapshot[]
+  totalIterations: number
+  totalCost: number
+  earlyStopReason?: string
+  testAccuracy?: number
+}
+
+const DEFAULT_CONFIG: IterativeConfig = {
+  maxIterations: 10,
+  frontierSize: 3,
+  generationStrategies: ['augment', 'decompose'],
+  earlyStoppingPatience: 3,
+  costBudget: 50_000,
+  scorer: () => 0,
+  agentRunner: { run: async () => ({ predicted: '', agentOutput: '', costTokens: 0 }) },
+  taskModelId: 'claude-sonnet-4-6',
+  rewriteModelId: 'claude-sonnet-4-6',
+  benchmarkDomain: 'general',
+  seed: 42,
+}
+
+function contentHash(content: string): string {
+  return createHash('sha256').update(content, 'utf-8').digest('hex')
+}
+
+export class IterativeEvaluator {
+  private readonly config: IterativeConfig
+  private readonly failureAnalyzer: FailureAnalyzer
+  private readonly generator: SkillVariantGenerator
+  private readonly selector: VariantSelector
+  private totalCost = 0
+
+  constructor(config: Partial<IterativeConfig>) {
+    this.config = { ...DEFAULT_CONFIG, ...config }
+    this.failureAnalyzer = new FailureAnalyzer({ mode: 'heuristic' })
+    this.generator = new SkillVariantGenerator({
+      strategies: this.config.generationStrategies,
+      rewriteModelId: this.config.rewriteModelId,
+      rewriteClient: this.config.rewriteClient,
+      benchmarkDomain: this.config.benchmarkDomain,
+    })
+    this.selector = new VariantSelector()
+  }
+
+  /**
+   * Run the iterative evaluation loop.
+   *
+   * @param baselineContent - Initial skill content
+   * @param skillId - Skill identifier
+   * @param trainTasks - Tasks for training evaluation
+   * @param valTasks - Tasks for validation evaluation
+   * @param testTasks - Tasks for final test evaluation (never seen during iteration)
+   */
+  async run(
+    baselineContent: string,
+    skillId: string,
+    trainTasks: EvalTask[],
+    valTasks: EvalTask[],
+    testTasks: EvalTask[]
+  ): Promise<IterativeResult> {
+    const convergenceCurve: IterationSnapshot[] = []
+
+    // Pre-loop: evaluate baseline on val split to seed frontier
+    const baselineVariant: SkillVariant = {
+      id: randomUUID(),
+      contentHash: contentHash(baselineContent),
+      content: baselineContent,
+      parentId: null,
+      skillId,
+      iteration: 0,
+      generationMethod: 'baseline',
+      contentLines: baselineContent.split('\n').length,
+      costTokens: 0,
+    }
+
+    const baselineScored = await this.evaluateVariant(baselineVariant, valTasks)
+    let frontier: ScoredVariant[] = [baselineScored]
+    let bestAccuracy = baselineScored.accuracy
+    let stagnantIterations = 0
+
+    this.log(0, bestAccuracy, frontier.length)
+    convergenceCurve.push({ iteration: 0, bestAccuracy, cost: this.totalCost })
+
+    // Iteration loop
+    let iteration = 0
+    let earlyStopReason: string | undefined
+
+    for (iteration = 1; iteration <= this.config.maxIterations; iteration++) {
+      // Cost guard
+      if (this.totalCost >= this.config.costBudget) {
+        earlyStopReason = `budget exhausted (${this.totalCost}/${this.config.costBudget} tokens)`
+        this.logBudget(iteration)
+        break
+      }
+
+      // Step 1: Evaluate frontier on train split + analyze failures
+      const allCandidates: ScoredVariant[] = [...frontier]
+
+      for (const frontierMember of frontier) {
+        const trainResult = await this.evaluateVariant(frontierMember.variant, trainTasks)
+        const failures = this.extractFailures(frontierMember.variant, trainTasks, trainResult)
+        const patterns = this.failureAnalyzer.analyze(failures)
+
+        // Step 2: Generate variants
+        const variants = await this.generator.generate({
+          skillId,
+          content: frontierMember.variant.content,
+          parentId: frontierMember.variant.id,
+          iteration,
+          failurePatterns: patterns,
+        })
+
+        // Step 3: Evaluate candidates on val split
+        for (const variant of variants) {
+          if (this.totalCost >= this.config.costBudget) break
+          const scored = await this.evaluateVariant(variant, valTasks)
+          allCandidates.push(scored)
+        }
+      }
+
+      // Step 4: Select new frontier
+      frontier = this.selector.select(allCandidates, this.config.frontierSize)
+
+      // Track best accuracy
+      const iterationBest = Math.max(...frontier.map((f) => f.accuracy))
+      if (iterationBest > bestAccuracy) {
+        bestAccuracy = iterationBest
+        stagnantIterations = 0
+      } else {
+        stagnantIterations++
+      }
+
+      this.log(iteration, bestAccuracy, frontier.length)
+      convergenceCurve.push({ iteration, bestAccuracy, cost: this.totalCost })
+
+      // Early stopping
+      if (stagnantIterations >= this.config.earlyStoppingPatience) {
+        earlyStopReason = `no improvement for ${this.config.earlyStoppingPatience} iterations`
+        break
+      }
+
+      this.generator.resetDedup()
+    }
+
+    // Final: evaluate best on test split
+    const bestVariant = frontier.reduce((a, b) => (a.accuracy >= b.accuracy ? a : b))
+    const testResult = await this.evaluateVariant(bestVariant.variant, testTasks)
+
+    return {
+      finalFrontier: frontier,
+      convergenceCurve,
+      totalIterations: iteration,
+      totalCost: this.totalCost,
+      earlyStopReason,
+      testAccuracy: testResult.accuracy,
+    }
+  }
+
+  private async evaluateVariant(variant: SkillVariant, tasks: EvalTask[]): Promise<ScoredVariant> {
+    let correct = 0
+    let evalCost = 0
+
+    for (const task of tasks) {
+      const result = await this.config.agentRunner.run({
+        skillContent: variant.content,
+        question: task.question,
+        modelId: this.config.taskModelId,
+      })
+
+      const score = await this.config.scorer(task.question, result.predicted, task.groundTruth)
+      if (score >= 0.5) correct++
+      evalCost += result.costTokens
+    }
+
+    this.totalCost += evalCost
+
+    return {
+      variant,
+      accuracy: tasks.length > 0 ? correct / tasks.length : 0,
+      cost: (variant.costTokens ?? 0) + evalCost,
+      skillSize: variant.content.split('\n').length,
+    }
+  }
+
+  private extractFailures(
+    variant: SkillVariant,
+    tasks: EvalTask[],
+    _scored: ScoredVariant
+  ): Array<{
+    taskId: string
+    predicted: string
+    groundTruth: string
+    agentOutput: string
+    toolCallFailed?: boolean
+    toolCallCount?: number
+  }> {
+    // In production, this would use cached agent outputs from evaluateVariant.
+    // For the iteration loop, we re-run and collect failures.
+    // This is a simplification — the real implementation would cache results.
+    void variant
+    void tasks
+    return []
+  }
+
+  private log(iteration: number, bestAccuracy: number, frontierSize: number): void {
+    const max = this.config.maxIterations
+    const cost = `${Math.round(this.totalCost / 1000)}K tokens`
+    console.log(
+      `[IterativeEvaluator] [iteration=${iteration}/${max}] [best_accuracy=${bestAccuracy.toFixed(2)}] [frontier_size=${frontierSize}] [cost=${cost}]`
+    )
+  }
+
+  private logBudget(iteration: number): void {
+    const max = this.config.maxIterations
+    console.log(
+      `[IterativeEvaluator] [BUDGET] stopping at iteration=${iteration}/${max} — budget exhausted (${this.totalCost}/${this.config.costBudget} tokens)`
+    )
+  }
+}
diff --git a/packages/core/src/evaluation/SkillVariantGenerator.ts b/packages/core/src/evaluation/SkillVariantGenerator.ts
new file mode 100644
index 00000000..fe748d58
--- /dev/null
+++ b/packages/core/src/evaluation/SkillVariantGenerator.ts
@@ -0,0 +1,251 @@
+/**
+ * @fileoverview SkillVariantGenerator — produce improved skill variants
+ * @module @skillsmith/core/evaluation/SkillVariantGenerator
+ * @see SMI-3296: 4 generation strategies (decompose, augment, specialize, LLM rewrite)
+ *
+ * Strategies ordered by cost:
+ *  1. Decompose (0 tokens) — split large skills via SkillDecomposer
+ *  2. Augment (0 tokens) — append failure fixes to skill content
+ *  3. Specialize (0 tokens) — remove irrelevant sections for benchmark domain
+ *  4. LLM Rewrite (~5K tokens) — Claude rewrites skill based on failure patterns
+ */
+
+import { createHash, randomUUID } from 'crypto'
+import type { FailurePattern, GenerationMethod, SkillVariant } from './types.js'
+
+/** LLM client for rewrite strategy — injected to avoid SDK dependency */
+export interface RewriteClient {
+  rewrite(params: {
+    model: string
+    skillContent: string
+    failurePatterns: FailurePattern[]
+    benchmarkDomain: string
+  }): Promise<string>
+}
+
+/** Configuration for SkillVariantGenerator */
+export interface VariantGeneratorConfig {
+  strategies: GenerationMethod[]
+  rewriteModelId: string
+  rewriteClient?: RewriteClient
+  benchmarkDomain: string
+}
+
+const DEFAULT_CONFIG: VariantGeneratorConfig = {
+  strategies: ['augment', 'decompose'],
+  rewriteModelId: 'claude-sonnet-4-6',
+  benchmarkDomain: 'general',
+}
+
+/** Compute SHA-256 content hash */
+function contentHash(content: string): string {
+  return createHash('sha256').update(content, 'utf-8').digest('hex')
+}
+
+/** Count non-empty lines in content */
+function lineCount(content: string): number {
+  return content.split('\n').length
+}
+
+export class SkillVariantGenerator {
+  private readonly config: VariantGeneratorConfig
+  private readonly seenHashes: Set<string>
+
+  constructor(config?: Partial<VariantGeneratorConfig>) {
+    this.config = { ...DEFAULT_CONFIG, ...config }
+    this.seenHashes = new Set()
+  }
+
+  /**
+   * Generate variants from a skill using all configured strategies.
+   * Deduplicates by content hash — identical outputs from different
+   * strategies or frontier members are returned only once.
+   */
+  async generate(params: {
+    skillId: string
+    content: string
+    parentId: string | null
+    iteration: number
+    failurePatterns: FailurePattern[]
+  }): Promise<SkillVariant[]> {
+    const variants: SkillVariant[] = []
+
+    for (const strategy of this.config.strategies) {
+      const result = await this.applyStrategy(strategy, params)
+      if (result === null) continue
+
+      const hash = contentHash(result)
+      if (this.seenHashes.has(hash)) continue
+
+      this.seenHashes.add(hash)
+      variants.push({
+        id: randomUUID(),
+        contentHash: hash,
+        content: result,
+        parentId: params.parentId,
+        skillId: params.skillId,
+        iteration: params.iteration,
+        generationMethod: strategy,
+        contentLines: lineCount(result),
+        costTokens: strategy === 'llm_rewrite' ? result.length : 0,
+      })
+    }
+
+    return variants
+  }
+
+  /** Reset seen hashes between runs */
+  resetDedup(): void {
+    this.seenHashes.clear()
+  }
+
+  private async applyStrategy(
+    strategy: GenerationMethod,
+    params: {
+      content: string
+      failurePatterns: FailurePattern[]
+    }
+  ): Promise<string | null> {
+    switch (strategy) {
+      case 'decompose':
+        return this.decompose(params.content)
+      case 'augment':
+        return this.augment(params.content, params.failurePatterns)
+      case 'specialize':
+        return this.specialize(params.content)
+      case 'llm_rewrite':
+        return this.llmRewrite(params.content, params.failurePatterns)
+      case 'baseline':
+        return null
+    }
+  }
+
+  /**
+   * Strategy 1: Decompose — split large skills via structural analysis.
+   * Only applicable if source skill >200 lines.
+   * Returns simplified main skill content (sub-skills not tracked individually).
+   */
+  private decompose(content: string): string | null {
+    const sourceLines = lineCount(content)
+    if (sourceLines <= 200) return null
+
+    // Extract first major section as a focused variant
+    const lines = content.split('\n')
+    const sectionStarts: number[] = []
+    for (let i = 0; i < lines.length; i++) {
+      if (lines[i].startsWith('## ')) {
+        sectionStarts.push(i)
+      }
+    }
+
+    if (sectionStarts.length < 2) return null
+
+    // Keep header + first 2 sections as a focused sub-skill
+    const cutoff = sectionStarts.length >= 3 ? sectionStarts[2] : lines.length
+    const focused = lines.slice(0, cutoff).join('\n').trim()
+
+    // Only return if meaningfully shorter
+    if (lineCount(focused) >= sourceLines * 0.8) return null
+
+    return focused
+  }
+
+  /**
+   * Strategy 2: Augment — append top-3 failure fixes to skill content.
+   * If `## Skill Improvement Notes` already exists, replace it.
+   */
+  private augment(content: string, failurePatterns: FailurePattern[]): string | null {
+    if (failurePatterns.length === 0) return null
+
+    const top3 = failurePatterns
+      .slice(0, 3)
+      .map((p) => `- **${p.category}** (${p.frequency} occurrences): ${p.suggestedFix}`)
+      .join('\n')
+
+    const section = `\n\n## Skill Improvement Notes\n\n${top3}\n`
+
+    // Replace existing section if present
+    const sectionRegex = /\n## Skill Improvement Notes\n[\s\S]*?(?=\n## |\n*$)/
+    if (sectionRegex.test(content)) {
+      return content.replace(sectionRegex, section).trim()
+    }
+
+    return (content.trimEnd() + section).trim()
+  }
+
+  /**
+   * Strategy 3: Specialize — remove generic sections irrelevant to benchmark.
+   * Strips sections that don't mention the benchmark domain keywords.
+   */
+  private specialize(content: string): string | null {
+    const domain = this.config.benchmarkDomain.toLowerCase()
+    if (domain === 'general') return null
+
+    const lines = content.split('\n')
+    const result: string[] = []
+    let inSection = false
+    let sectionLines: string[] = []
+    let sectionRelevant = false
+
+    const domainKeywords = domain.split(/[\s,]+/)
+
+    for (const line of lines) {
+      if (line.startsWith('## ')) {
+        // Flush previous section
+        if (inSection && sectionRelevant) {
+          result.push(...sectionLines)
+        }
+        inSection = true
+        sectionLines = [line]
+        sectionRelevant = false
+      } else if (inSection) {
+        sectionLines.push(line)
+        const lower = line.toLowerCase()
+        if (domainKeywords.some((kw) => lower.includes(kw))) {
+          sectionRelevant = true
+        }
+      } else {
+        // Header content before first ##
+        result.push(line)
+      }
+    }
+
+    // Flush last section
+    if (inSection && sectionRelevant) {
+      result.push(...sectionLines)
+    }
+
+    const specialized = result.join('\n').trim()
+
+    // Only return if meaningfully shorter (>10% reduction)
+    if (specialized.length >= content.length * 0.9) return null
+    // Must retain at least some content
+    if (specialized.length < 50) return null
+
+    return specialized
+  }
+
+  /**
+   * Strategy 4: LLM Rewrite — send skill + failures to Claude for creative rewrite.
+   * Requires injected RewriteClient.
+   */
+  private async llmRewrite(
+    content: string,
+    failurePatterns: FailurePattern[]
+  ): Promise<string | null> {
+    if (!this.config.rewriteClient) return null
+    if (failurePatterns.length === 0) return null
+
+    const result = await this.config.rewriteClient.rewrite({
+      model: this.config.rewriteModelId,
+      skillContent: content,
+      failurePatterns,
+      benchmarkDomain: this.config.benchmarkDomain,
+    })
+
+    // Ensure result is different from input
+    if (contentHash(result) === contentHash(content)) return null
+
+    return result
+  }
+}
diff --git a/packages/core/src/evaluation/VariantSelector.ts b/packages/core/src/evaluation/VariantSelector.ts
new file mode 100644
index 00000000..28c274b9
--- /dev/null
+++ b/packages/core/src/evaluation/VariantSelector.ts
@@ -0,0 +1,58 @@
+/**
+ * @fileoverview VariantSelector — Pareto frontier selection for skill variants
+ * @module @skillsmith/core/evaluation/VariantSelector
+ * @see SMI-3297: Select non-dominated variants by accuracy vs cost
+ *
+ * Pareto dominance: A dominates B if A.accuracy >= B.accuracy AND A.cost <= B.cost
+ * with at least one strict inequality.
+ * Tiebreaker: prefer smaller skillSize (fewer tokens in context).
+ */
+
+import type { ScoredVariant } from './types.js'
+
+export class VariantSelector {
+  /**
+   * Select top non-dominated variants from candidates.
+   * Returns at most `frontierSize` variants from the Pareto frontier.
+   *
+   * @param candidates - Scored variants to select from
+   * @param frontierSize - Maximum number of variants to retain
+   * @returns Non-dominated variants, sorted by accuracy descending
+   */
+  select(candidates: ScoredVariant[], frontierSize: number): ScoredVariant[] {
+    if (candidates.length === 0) return []
+    if (candidates.length <= frontierSize) {
+      return this.filterDominated(candidates)
+    }
+
+    const frontier = this.filterDominated(candidates)
+
+    if (frontier.length <= frontierSize) return frontier
+
+    // More non-dominated than we need — pick by accuracy + tiebreak on skillSize
+    return frontier
+      .sort((a, b) => {
+        const accDiff = b.accuracy - a.accuracy
+        if (Math.abs(accDiff) > 1e-9) return accDiff
+        return a.skillSize - b.skillSize
+      })
+      .slice(0, frontierSize)
+  }
+
+  /**
+   * Remove dominated variants from the set.
+   * A variant is dominated if any other variant has >= accuracy AND <= cost
+   * with at least one strict inequality.
+   */
+  private filterDominated(candidates: ScoredVariant[]): ScoredVariant[] {
+    return candidates.filter((candidate, _i) => {
+      return !candidates.some(
+        (other) =>
+          other !== candidate &&
+          other.accuracy >= candidate.accuracy &&
+          other.cost <= candidate.cost &&
+          (other.accuracy > candidate.accuracy || other.cost < candidate.cost)
+      )
+    })
+  }
+}
diff --git a/packages/core/src/evaluation/index.ts b/packages/core/src/evaluation/index.ts
new file mode 100644
index 00000000..d9e6ace7
--- /dev/null
+++ b/packages/core/src/evaluation/index.ts
@@ -0,0 +1,31 @@
+// Evaluation module barrel export — EvoSkill Study B (task-accuracy evaluator)
+export { FailureAnalyzer } from './FailureAnalyzer.js'
+export { SkillVariantGenerator } from './SkillVariantGenerator.js'
+export type { RewriteClient, VariantGeneratorConfig } from './SkillVariantGenerator.js'
+export { VariantSelector } from './VariantSelector.js'
+export { IterativeEvaluator } from './IterativeEvaluator.js'
+export type {
+  AgentRunner,
+  EvalTask,
+  IterativeConfig,
+  IterativeResult,
+  IterationSnapshot,
+} from './IterativeEvaluator.js'
+export type {
+  FailureAnalyzerConfig,
+  FailureCategory,
+  FailurePattern,
+  TaskFailure,
+  GenerationMethod,
+  SkillVariant,
+  ScoredVariant,
+  BenchmarkId,
+  SplitType,
+  ScorerType,
+  BenchmarkResultRow,
+  BenchmarkResultInput,
+  SkillVariantRow,
+  SkillVariantInput,
+  FailurePatternRow,
+  FailurePatternInput,
+} from './types.js'
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
new file mode 100644
index 00000000..c2b711d9
--- /dev/null
+++ b/packages/core/src/evaluation/types.ts
@@ -0,0 +1,177 @@
+/**
+ * @fileoverview Types for the EvoSkill task-accuracy evaluator (Study B)
+ * @module @skillsmith/core/evaluation/types
+ * @see Plan: docs/internal/implementation/evoskill-task-accuracy-evaluator.md
+ */
+
+// ============================================================================
+// Failure Analysis Types
+// ============================================================================
+
+/** Categories of task failures detected by FailureAnalyzer */
+export type FailureCategory =
+  | 'wrong_format'
+  | 'missing_context'
+  | 'reasoning_error'
+  | 'tool_misuse'
+  | 'hallucination'
+
+/** A single task failure with agent output and ground truth */
+export interface TaskFailure {
+  taskId: string
+  predicted: string
+  groundTruth: string
+  agentOutput: string
+  toolCallFailed?: boolean
+  toolCallCount?: number
+}
+
+/** A categorized failure pattern with frequency and fix suggestion */
+export interface FailurePattern {
+  category: FailureCategory
+  frequency: number
+  examples: TaskFailure[] // max 5 representative examples
+  suggestedFix: string // natural language improvement for skill content
+}
+
+/** Configuration for FailureAnalyzer */
+export interface FailureAnalyzerConfig {
+  mode: 'heuristic' | 'llm'
+  maxExamplesPerCategory?: number // default: 5
+}
+
+// ============================================================================
+// Skill Variant Types
+// ============================================================================
+
+/** Generation methods for producing skill variants */
+export type GenerationMethod = 'baseline' | 'decompose' | 'augment' | 'specialize' | 'llm_rewrite'
+
+/** A skill variant generated during iterative evaluation */
+export interface SkillVariant {
+  id: string // UUID (primary key for DB references)
+  contentHash: string // SHA-256 of content (deduplication key)
+  content: string // SKILL.md content
+  parentId: string | null // derivation lineage
+  skillId: string
+  iteration: number
+  generationMethod: GenerationMethod
+  contentLines?: number
+  costTokens?: number
+}
+
+/** A variant scored on accuracy and cost for Pareto selection */
+export interface ScoredVariant {
+  variant: SkillVariant
+  accuracy: number // 0-1 on validation split
+  cost: number // tokens consumed during generation
+  skillSize: number // lines in SKILL.md
+}
+
+// ============================================================================
+// Benchmark Result Types (DB rows)
+// ============================================================================
+
+/** Benchmark identifiers supported by the evaluator */
+export type BenchmarkId = 'officeqa' | 'sealqa' | 'browsecomp'
+
+/** Data split types */
+export type SplitType = 'train' | 'val' | 'test'
+
+/** Scorer types */
+export type ScorerType = 'exact_match' | 'llm_judge'
+
+/** Row shape for benchmark_results table */
+export interface BenchmarkResultRow {
+  id: string
+  skill_id: string
+  skill_variant_hash: string
+  benchmark: BenchmarkId
+  split: SplitType
+  condition: string
+  iteration: number
+  accuracy: number
+  task_count: number
+  correct_count: number
+  cost_tokens: number | null
+  cost_dollars: number | null
+  wall_clock_ms: number | null
+  scorer: ScorerType
+  model_id: string
+  seed: number
+  created_at: string
+}
+
+/** Input for inserting a benchmark result */
+export interface BenchmarkResultInput {
+  id: string
+  skillId: string
+  skillVariantHash: string
+  benchmark: BenchmarkId
+  split: SplitType
+  condition: string
+  iteration?: number
+  accuracy: number
+  taskCount: number
+  correctCount: number
+  costTokens?: number
+  costDollars?: number
+  wallClockMs?: number
+  scorer: ScorerType
+  modelId: string
+  seed: number
+}
+
+/** Row shape for skill_variants table */
+export interface SkillVariantRow {
+  id: string
+  skill_id: string
+  parent_variant_id: string | null
+  content_hash: string
+  iteration: number
+  generation_method: GenerationMethod
+  accuracy_train: number | null
+  accuracy_val: number | null
+  accuracy_test: number | null
+  content_lines: number | null
+  cost_tokens: number | null
+  is_frontier: number // 0 or 1
+  created_at: string
+}
+
+/** Input for inserting a skill variant */
+export interface SkillVariantInput {
+  id: string
+  skillId: string
+  parentVariantId?: string | null
+  contentHash: string
+  iteration: number
+  generationMethod: GenerationMethod
+  accuracyTrain?: number | null
+  accuracyVal?: number | null
+  accuracyTest?: number | null
+  contentLines?: number | null
+  costTokens?: number | null
+  isFrontier?: boolean
+}
+
+/** Row shape for failure_patterns table */
+export interface FailurePatternRow {
+  id: string
+  benchmark_result_id: string
+  category: FailureCategory
+  frequency: number
+  example_tasks: string | null // JSON array of task IDs
+  suggested_fix: string | null
+  created_at: string
+}
+
+/** Input for inserting a failure pattern */
+export interface FailurePatternInput {
+  id: string
+  benchmarkResultId: string
+  category: FailureCategory
+  frequency: number
+  exampleTasks?: string[] // task IDs
+  suggestedFix?: string
+}
diff --git a/packages/core/src/exports/repositories.ts b/packages/core/src/exports/repositories.ts
index d59555d1..7ddf45fb 100644
--- a/packages/core/src/exports/repositories.ts
+++ b/packages/core/src/exports/repositories.ts
@@ -139,3 +139,9 @@ export { CoInstallRepository, type CoInstallSummary } from '../repositories/CoIn
 // ============================================================================
 
 export { SkillDependencyRepository } from '../repositories/SkillDependencyRepository.js'
+
+// ============================================================================
+// Benchmark Repository (SMI-3292)
+// ============================================================================
+
+export { BenchmarkRepository } from '../repositories/BenchmarkRepository.js'
diff --git a/packages/core/src/exports/services.ts b/packages/core/src/exports/services.ts
index da24530f..a708dd4e 100644
--- a/packages/core/src/exports/services.ts
+++ b/packages/core/src/exports/services.ts
@@ -365,3 +365,9 @@ export {
   type BillingErrorCode,
   type LicenseTier,
 } from '../billing/index.js'
+
+// ============================================================================
+// Evaluation (SMI-3284: EvoSkill Task-Accuracy Evaluator)
+// ============================================================================
+
+export { FailureAnalyzer } from '../evaluation/FailureAnalyzer.js'
diff --git a/packages/core/src/exports/types.ts b/packages/core/src/exports/types.ts
index 920525b3..a885f182 100644
--- a/packages/core/src/exports/types.ts
+++ b/packages/core/src/exports/types.ts
@@ -246,3 +246,26 @@ export type {
   AuditLoggerConfig,
   AuditStats,
 } from '../security/AuditLogger.js'
+
+// ============================================================================
+// Evaluation Types (SMI-3284: EvoSkill Task-Accuracy Evaluator)
+// ============================================================================
+
+export type {
+  FailureCategory,
+  TaskFailure,
+  FailurePattern,
+  FailureAnalyzerConfig,
+  GenerationMethod,
+  SkillVariant,
+  ScoredVariant,
+  BenchmarkId,
+  SplitType,
+  ScorerType,
+  BenchmarkResultRow,
+  BenchmarkResultInput,
+  SkillVariantRow,
+  SkillVariantInput,
+  FailurePatternRow,
+  FailurePatternInput,
+} from '../evaluation/types.js'
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index c16a2530..13aa084b 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -102,6 +102,62 @@ export {
   validateEmbeddingResults,
 } from './benchmarks/index.js'
 
+// EvoSkill benchmark harness (SMI-3255, SMI-3258)
+export {
+  // IR metrics
+  ndcg, mrr, mapAtK, precisionAtK, recallAtK,
+  // Scorers
+  exactMatchScorer, createLlmJudgeScorer, getScorerForBenchmark,
+  // Constants
+  EVOSKILL_DEFAULTS, CONDITIONS, NotImplementedError,
+  // Dataset
+  loadDataset, loadCSVDataset, loadJSONDataset,
+  // Skill selectors
+  createBaselineSelector, createEvoSkillEvolvedSelector,
+  createSearchSelector, createRecommendSelector,
+  createOptimizedSelector, createSkillCreateSelector,
+  createIterativeSelector, createHybridSelector, createCuratedSelector,
+  // Agent runner
+  runEvoSkillTask, runEvoSkillBatch, calculateCost,
+  // Evaluator
+  evaluate, aggregateSeeds,
+  // Harness
+  runHarness,
+  // Report
+  generateMarkdownReport, generateJsonReport,
+} from './benchmarks/evoskill/index.js'
+
+// EvoSkill types (SMI-3255, SMI-3258)
+export type {
+  BenchmarkTask, ConditionConfig, EvoSkillBenchmarkResult, ScorerFn, HarnessConfig,
+  LlmJudgeClient, DatasetLoadResult, SkillSelectorFn,
+  SkillsmithSearchClient, SkillsmithRecommendClient,
+  TransformationService, SkillCreateRunner,
+  ConditionNumber, ConditionName,
+  AgentClient, AgentRunnerConfig, TaskResult, TaskTokenUsage,
+  EvaluatorConfig, HarnessDependencies, HarnessResult,
+  HarnessProgressFn, HarnessProgressEvent, ReportOptions,
+} from './benchmarks/evoskill/index.js'
+
+// EvoSkill evaluation (Study B: SMI-3284)
+export {
+  FailureAnalyzer,
+  SkillVariantGenerator,
+  VariantSelector,
+  IterativeEvaluator,
+} from './evaluation/index.js'
+
+export type {
+  RewriteClient, VariantGeneratorConfig,
+  AgentRunner, EvalTask, IterativeConfig, IterativeResult, IterationSnapshot,
+  FailureAnalyzerConfig, FailureCategory, FailurePattern, TaskFailure,
+  GenerationMethod, SkillVariant, ScoredVariant,
+  BenchmarkId, SplitType, ScorerType,
+  BenchmarkResultRow, BenchmarkResultInput,
+  SkillVariantRow, SkillVariantInput,
+  FailurePatternRow, FailurePatternInput,
+} from './evaluation/index.js'
+
 // Telemetry (SMI-739)
 export {
   SkillsmithTracer,
diff --git a/packages/core/src/repositories/BenchmarkRepository.ts b/packages/core/src/repositories/BenchmarkRepository.ts
new file mode 100644
index 00000000..1f53764f
--- /dev/null
+++ b/packages/core/src/repositories/BenchmarkRepository.ts
@@ -0,0 +1,230 @@
+/**
+ * @fileoverview Repository for EvoSkill benchmark evaluator tables
+ * @module @skillsmith/core/repositories/BenchmarkRepository
+ * @see SMI-3292: BenchmarkRepository CRUD + migration
+ *
+ * Provides CRUD operations for:
+ *  - benchmark_results: evaluation results across conditions/benchmarks/splits
+ *  - skill_variants: skill variants generated during iterative evaluation
+ *  - failure_patterns: categorized failure patterns per evaluation
+ */
+
+import type { Database } from '../db/database-interface.js'
+import type {
+  BenchmarkResultInput,
+  BenchmarkResultRow,
+  SkillVariantInput,
+  SkillVariantRow,
+  FailurePatternInput,
+  FailurePatternRow,
+  BenchmarkId,
+  SplitType,
+} from '../evaluation/types.js'
+
+export class BenchmarkRepository {
+  constructor(private readonly db: Database) {}
+
+  // ==========================================================================
+  // benchmark_results
+  // ==========================================================================
+
+  insertResult(input: BenchmarkResultInput): void {
+    const stmt = this.db.prepare(`
+      INSERT INTO benchmark_results
+        (id, skill_id, skill_variant_hash, benchmark, split, condition,
+         iteration, accuracy, task_count, correct_count, cost_tokens,
+         cost_dollars, wall_clock_ms, scorer, model_id, seed)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    `)
+    stmt.run(
+      input.id,
+      input.skillId,
+      input.skillVariantHash,
+      input.benchmark,
+      input.split,
+      input.condition,
+      input.iteration ?? 0,
+      input.accuracy,
+      input.taskCount,
+      input.correctCount,
+      input.costTokens ?? null,
+      input.costDollars ?? null,
+      input.wallClockMs ?? null,
+      input.scorer,
+      input.modelId,
+      input.seed
+    )
+  }
+
+  getResult(id: string): BenchmarkResultRow | undefined {
+    return this.db.prepare('SELECT * FROM benchmark_results WHERE id = ?').get(id) as
+      | BenchmarkResultRow
+      | undefined
+  }
+
+  getResultsBySkill(
+    skillId: string,
+    benchmark?: BenchmarkId,
+    split?: SplitType
+  ): BenchmarkResultRow[] {
+    let sql = 'SELECT * FROM benchmark_results WHERE skill_id = ?'
+    const params: unknown[] = [skillId]
+
+    if (benchmark) {
+      sql += ' AND benchmark = ?'
+      params.push(benchmark)
+    }
+    if (split) {
+      sql += ' AND split = ?'
+      params.push(split)
+    }
+
+    sql += ' ORDER BY created_at DESC'
+    return this.db.prepare(sql).all(...params) as BenchmarkResultRow[]
+  }
+
+  getResultsByCondition(condition: string, benchmark: BenchmarkId): BenchmarkResultRow[] {
+    return this.db
+      .prepare(
+        `SELECT * FROM benchmark_results
+         WHERE condition = ? AND benchmark = ?
+         ORDER BY iteration ASC, seed ASC`
+      )
+      .all(condition, benchmark) as BenchmarkResultRow[]
+  }
+
+  deleteResult(id: string): boolean {
+    const info = this.db.prepare('DELETE FROM benchmark_results WHERE id = ?').run(id)
+    return info.changes > 0
+  }
+
+  // ==========================================================================
+  // skill_variants
+  // ==========================================================================
+
+  insertVariant(input: SkillVariantInput): void {
+    const stmt = this.db.prepare(`
+      INSERT INTO skill_variants
+        (id, skill_id, parent_variant_id, content_hash, iteration,
+         generation_method, accuracy_train, accuracy_val, accuracy_test,
+         content_lines, cost_tokens, is_frontier)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    `)
+    stmt.run(
+      input.id,
+      input.skillId,
+      input.parentVariantId ?? null,
+      input.contentHash,
+      input.iteration,
+      input.generationMethod,
+      input.accuracyTrain ?? null,
+      input.accuracyVal ?? null,
+      input.accuracyTest ?? null,
+      input.contentLines ?? null,
+      input.costTokens ?? null,
+      input.isFrontier ? 1 : 0
+    )
+  }
+
+  getVariant(id: string): SkillVariantRow | undefined {
+    return this.db.prepare('SELECT * FROM skill_variants WHERE id = ?').get(id) as
+      | SkillVariantRow
+      | undefined
+  }
+
+  getVariantByHash(skillId: string, contentHash: string): SkillVariantRow | undefined {
+    return this.db
+      .prepare('SELECT * FROM skill_variants WHERE skill_id = ? AND content_hash = ?')
+      .get(skillId, contentHash) as SkillVariantRow | undefined
+  }
+
+  getFrontierVariants(skillId: string): SkillVariantRow[] {
+    return this.db
+      .prepare(
+        `SELECT * FROM skill_variants
+         WHERE skill_id = ? AND is_frontier = 1
+         ORDER BY accuracy_val DESC NULLS LAST`
+      )
+      .all(skillId) as SkillVariantRow[]
+  }
+
+  updateVariantAccuracy(
+    id: string,
+    accuracyTrain: number | null,
+    accuracyVal: number | null,
+    accuracyTest: number | null
+  ): boolean {
+    const info = this.db
+      .prepare(
+        `UPDATE skill_variants
+         SET accuracy_train = ?, accuracy_val = ?, accuracy_test = ?
+         WHERE id = ?`
+      )
+      .run(accuracyTrain, accuracyVal, accuracyTest, id)
+    return info.changes > 0
+  }
+
+  setFrontier(id: string, isFrontier: boolean): boolean {
+    const info = this.db
+      .prepare('UPDATE skill_variants SET is_frontier = ? WHERE id = ?')
+      .run(isFrontier ? 1 : 0, id)
+    return info.changes > 0
+  }
+
+  clearFrontier(skillId: string): void {
+    this.db.prepare('UPDATE skill_variants SET is_frontier = 0 WHERE skill_id = ?').run(skillId)
+  }
+
+  deleteVariant(id: string): boolean {
+    const info = this.db.prepare('DELETE FROM skill_variants WHERE id = ?').run(id)
+    return info.changes > 0
+  }
+
+  // ==========================================================================
+  // failure_patterns
+  // ==========================================================================
+
+  insertPattern(input: FailurePatternInput): void {
+    const stmt = this.db.prepare(`
+      INSERT INTO failure_patterns
+        (id, benchmark_result_id, category, frequency, example_tasks, suggested_fix)
+      VALUES (?, ?, ?, ?, ?, ?)
+    `)
+    stmt.run(
+      input.id,
+      input.benchmarkResultId,
+      input.category,
+      input.frequency,
+      input.exampleTasks ? JSON.stringify(input.exampleTasks) : null,
+      input.suggestedFix ?? null
+    )
+  }
+
+  getPattern(id: string): FailurePatternRow | undefined {
+    return this.db.prepare('SELECT * FROM failure_patterns WHERE id = ?').get(id) as
+      | FailurePatternRow
+      | undefined
+  }
+
+  getPatternsByResult(benchmarkResultId: string): FailurePatternRow[] {
+    return this.db
+      .prepare(
+        `SELECT * FROM failure_patterns
+         WHERE benchmark_result_id = ?
+         ORDER BY frequency DESC`
+      )
+      .all(benchmarkResultId) as FailurePatternRow[]
+  }
+
+  deletePattern(id: string): boolean {
+    const info = this.db.prepare('DELETE FROM failure_patterns WHERE id = ?').run(id)
+    return info.changes > 0
+  }
+
+  deletePatternsByResult(benchmarkResultId: string): number {
+    const info = this.db
+      .prepare('DELETE FROM failure_patterns WHERE benchmark_result_id = ?')
+      .run(benchmarkResultId)
+    return info.changes
+  }
+}
diff --git a/packages/core/tests/benchmarks/agent-runner.test.ts b/packages/core/tests/benchmarks/agent-runner.test.ts
new file mode 100644
index 00000000..055afd73
--- /dev/null
+++ b/packages/core/tests/benchmarks/agent-runner.test.ts
@@ -0,0 +1,139 @@
+import { describe, it, expect, vi } from 'vitest'
+import { runEvoSkillTask, runEvoSkillBatch, calculateCost } from '../../src/benchmarks/evoskill/agent-runner.js'
+import type { BenchmarkTask } from '../../src/benchmarks/evoskill/types.js'
+import type { AgentClient } from '../../src/benchmarks/evoskill/agent-runner.js'
+
+const task: BenchmarkTask = {
+  id: 'test-1',
+  question: 'What is 2+2?',
+  groundTruth: '4',
+  split: 'test',
+  benchmark: 'officeqa',
+}
+
+const mockClient: AgentClient = {
+  async runTask() {
+    return { content: '4', inputTokens: 100, outputTokens: 50 }
+  },
+}
+
+describe('runEvoSkillTask', () => {
+  it('returns predicted content and tokens', async () => {
+    const result = await runEvoSkillTask(task, {
+      client: mockClient,
+      modelId: 'claude-sonnet-4-6',
+      skills: [],
+    })
+
+    expect(result.taskId).toBe('test-1')
+    expect(result.predicted).toBe('4')
+    expect(result.tokens.inputTokens).toBe(100)
+    expect(result.tokens.outputTokens).toBe(50)
+    expect(result.durationMs).toBeGreaterThanOrEqual(0)
+    expect(result.error).toBeUndefined()
+  })
+
+  it('captures errors gracefully', async () => {
+    const failClient: AgentClient = {
+      async runTask() { throw new Error('API error') },
+    }
+
+    const result = await runEvoSkillTask(task, {
+      client: failClient,
+      modelId: 'claude-sonnet-4-6',
+      skills: [],
+    })
+
+    expect(result.predicted).toBe('')
+    expect(result.error).toBe('API error')
+    expect(result.tokens.inputTokens).toBe(0)
+  })
+
+  it('retries on rate limit errors', async () => {
+    let attempts = 0
+    const rateLimitClient: AgentClient = {
+      async runTask() {
+        attempts++
+        if (attempts < 3) throw new Error('429 rate limit exceeded')
+        return { content: 'ok', inputTokens: 10, outputTokens: 5 }
+      },
+    }
+
+    const result = await runEvoSkillTask(task, {
+      client: rateLimitClient,
+      modelId: 'claude-sonnet-4-6',
+      skills: [],
+    })
+
+    expect(result.predicted).toBe('ok')
+    expect(attempts).toBe(3)
+  })
+
+  it('does not retry on non-rate-limit errors', async () => {
+    let attempts = 0
+    const errorClient: AgentClient = {
+      async runTask() {
+        attempts++
+        throw new Error('Invalid request')
+      },
+    }
+
+    const result = await runEvoSkillTask(task, {
+      client: errorClient,
+      modelId: 'claude-sonnet-4-6',
+      skills: [],
+    })
+
+    expect(attempts).toBe(1)
+    expect(result.error).toBe('Invalid request')
+  })
+})
+
+describe('runEvoSkillBatch', () => {
+  it('runs all tasks and reports progress', async () => {
+    const tasks: BenchmarkTask[] = [
+      { ...task, id: 't1' },
+      { ...task, id: 't2' },
+      { ...task, id: 't3' },
+    ]
+
+    const progress: Array<[number, number]> = []
+    const results = await runEvoSkillBatch(tasks, {
+      client: mockClient,
+      modelId: 'claude-sonnet-4-6',
+      skills: [],
+    }, (completed, total) => progress.push([completed, total]))
+
+    expect(results).toHaveLength(3)
+    expect(progress).toEqual([[1, 3], [2, 3], [3, 3]])
+  })
+})
+
+describe('calculateCost', () => {
+  it('calculates cost for sonnet model', () => {
+    const cost = calculateCost(
+      { inputTokens: 1000, outputTokens: 500 },
+      'claude-sonnet-4-6'
+    )
+    // 1000 * 3e-6 + 500 * 15e-6 = 0.003 + 0.0075 = 0.0105
+    expect(cost).toBeCloseTo(0.0105)
+  })
+
+  it('calculates cost for opus model', () => {
+    const cost = calculateCost(
+      { inputTokens: 1000, outputTokens: 500 },
+      'claude-opus-4-6'
+    )
+    // 1000 * 15e-6 + 500 * 75e-6 = 0.015 + 0.0375 = 0.0525
+    expect(cost).toBeCloseTo(0.0525)
+  })
+
+  it('uses default pricing for unknown models', () => {
+    const cost = calculateCost(
+      { inputTokens: 1000, outputTokens: 500 },
+      'unknown-model'
+    )
+    // Uses default (same as sonnet)
+    expect(cost).toBeCloseTo(0.0105)
+  })
+})
diff --git a/packages/core/tests/benchmarks/dataset-loader.test.ts b/packages/core/tests/benchmarks/dataset-loader.test.ts
new file mode 100644
index 00000000..8bba2d76
--- /dev/null
+++ b/packages/core/tests/benchmarks/dataset-loader.test.ts
@@ -0,0 +1,140 @@
+import { describe, it, expect } from 'vitest'
+import { loadCSVDataset, loadJSONDataset, loadDataset } from '../../src/benchmarks/evoskill/dataset-loader.js'
+
+describe('loadCSVDataset', () => {
+  const csv = [
+    'question,answer',
+    'What is 2+2?,4',
+    'Capital of France?,Paris',
+    'Color of sky?,Blue',
+    'Largest planet?,Jupiter',
+    'Speed of light?,299792458',
+    'Boiling point of water?,100',
+    'Chemical symbol for gold?,Au',
+    'Year of moon landing?,1969',
+    'Pi to 2 decimals?,3.14',
+    'Continent of Brazil?,South America',
+  ].join('\n')
+
+  it('parses all rows', () => {
+    const result = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    expect(result.tasks).toHaveLength(10)
+  })
+
+  it('splits into train/val/test', () => {
+    const result = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    expect(result.train.length).toBeGreaterThan(0)
+    expect(result.val.length).toBeGreaterThan(0)
+    expect(result.test.length).toBeGreaterThan(0)
+    expect(result.train.length + result.val.length + result.test.length).toBe(10)
+  })
+
+  it('assigns correct split labels', () => {
+    const result = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    for (const t of result.train) expect(t.split).toBe('train')
+    for (const t of result.val) expect(t.split).toBe('val')
+    for (const t of result.test) expect(t.split).toBe('test')
+  })
+
+  it('uses default split ratios (18/12/70)', () => {
+    // With 10 items: train=2, val=1, test=7
+    const result = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    expect(result.train).toHaveLength(2)
+    expect(result.val).toHaveLength(1)
+    expect(result.test).toHaveLength(7)
+  })
+
+  it('is deterministic with same seed', () => {
+    const a = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    const b = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    expect(a.train.map((t) => t.id)).toEqual(b.train.map((t) => t.id))
+    expect(a.test.map((t) => t.id)).toEqual(b.test.map((t) => t.id))
+  })
+
+  it('produces different shuffle with different seed', () => {
+    const a = loadCSVDataset(csv, 'officeqa', { seed: 42 })
+    const b = loadCSVDataset(csv, 'officeqa', { seed: 99 })
+    // With different seeds, order should differ (overwhelmingly likely with 10 items)
+    const aIds = a.tasks.map((t) => t.id)
+    const bIds = b.tasks.map((t) => t.id)
+    expect(aIds).not.toEqual(bIds)
+  })
+
+  it('handles quoted CSV fields with commas', () => {
+    const csvWithCommas = [
+      'question,answer',
+      '"What is 1,000 + 2,000?","3,000"',
+      'Simple question?,Yes',
+    ].join('\n')
+    const result = loadCSVDataset(csvWithCommas, 'officeqa', { seed: 42 })
+    const task = result.tasks.find((t) => t.question.includes('1,000'))
+    expect(task).toBeDefined()
+    expect(task!.groundTruth).toBe('3,000')
+  })
+
+  it('supports ground_truth column name', () => {
+    const altCsv = 'question,ground_truth\nQ1?,A1\nQ2?,A2\n'
+    const result = loadCSVDataset(altCsv, 'sealqa', { seed: 42 })
+    expect(result.tasks).toHaveLength(2)
+    expect(result.tasks[0].groundTruth).toBeTruthy()
+  })
+
+  it('throws for empty dataset', () => {
+    expect(() => loadCSVDataset('question,answer\n', 'officeqa')).toThrow('fewer than 2 lines')
+  })
+
+  it('throws for missing columns', () => {
+    expect(() => loadCSVDataset('foo,bar\n1,2\n', 'officeqa')).toThrow('missing required columns')
+  })
+})
+
+describe('loadJSONDataset', () => {
+  const jsonData = JSON.stringify(
+    Array.from({ length: 20 }, (_, i) => ({
+      question: `Question ${i + 1}`,
+      answer: `Answer ${i + 1}`,
+    }))
+  )
+
+  it('parses all items', () => {
+    const result = loadJSONDataset(jsonData, 'browsecomp', { seed: 42 })
+    expect(result.tasks).toHaveLength(20)
+  })
+
+  it('splits correctly', () => {
+    const result = loadJSONDataset(jsonData, 'browsecomp', { seed: 42 })
+    // 20 items: train=4 (18%), val=2 (12%), test=14 (70%)
+    expect(result.train).toHaveLength(4)
+    expect(result.val).toHaveLength(2)
+    expect(result.test).toHaveLength(14)
+  })
+
+  it('assigns browsecomp benchmark', () => {
+    const result = loadJSONDataset(jsonData, 'browsecomp', { seed: 42 })
+    for (const t of result.tasks) expect(t.benchmark).toBe('browsecomp')
+  })
+
+  it('throws for empty array', () => {
+    expect(() => loadJSONDataset('[]', 'browsecomp')).toThrow('empty')
+  })
+})
+
+describe('loadDataset', () => {
+  it('routes CSV for officeqa', () => {
+    const csv = 'question,answer\nQ?,A\nQ2?,A2\nQ3?,A3\nQ4?,A4\nQ5?,A5\n'
+    const result = loadDataset(csv, 'officeqa', { seed: 42 })
+    expect(result.tasks[0].benchmark).toBe('officeqa')
+  })
+
+  it('routes JSON for browsecomp', () => {
+    const json = JSON.stringify([
+      { question: 'Q1', answer: 'A1' },
+      { question: 'Q2', answer: 'A2' },
+      { question: 'Q3', answer: 'A3' },
+      { question: 'Q4', answer: 'A4' },
+      { question: 'Q5', answer: 'A5' },
+    ])
+    const result = loadDataset(json, 'browsecomp', { seed: 42 })
+    expect(result.tasks[0].benchmark).toBe('browsecomp')
+  })
+})
diff --git a/packages/core/tests/benchmarks/evaluator.test.ts b/packages/core/tests/benchmarks/evaluator.test.ts
new file mode 100644
index 00000000..597143b1
--- /dev/null
+++ b/packages/core/tests/benchmarks/evaluator.test.ts
@@ -0,0 +1,151 @@
+import { describe, it, expect } from 'vitest'
+import { evaluate, aggregateSeeds } from '../../src/benchmarks/evoskill/evaluator.js'
+import type { BenchmarkTask, EvoSkillBenchmarkResult } from '../../src/benchmarks/evoskill/types.js'
+import type { TaskResult } from '../../src/benchmarks/evoskill/agent-runner.js'
+
+const makeTasks = (n: number): BenchmarkTask[] =>
+  Array.from({ length: n }, (_, i) => ({
+    id: `test-${i + 1}`,
+    question: `Question ${i + 1}`,
+    groundTruth: `Answer ${i + 1}`,
+    split: 'test' as const,
+    benchmark: 'officeqa' as const,
+  }))
+
+const makeResults = (tasks: BenchmarkTask[], correctIds: Set<string>): TaskResult[] =>
+  tasks.map((t) => ({
+    taskId: t.id,
+    predicted: correctIds.has(t.id) ? t.groundTruth : 'wrong',
+    tokens: { inputTokens: 100, outputTokens: 50 },
+    durationMs: 500,
+  }))
+
+describe('evaluate', () => {
+  it('computes accuracy correctly', async () => {
+    const tasks = makeTasks(10)
+    const correct = new Set(['test-1', 'test-2', 'test-3'])
+    const results = makeResults(tasks, correct)
+
+    const result = await evaluate(tasks, results, {
+      scorer: (_q, predicted, groundTruth) => (predicted === groundTruth ? 1.0 : 0.0),
+      condition: 'baseline',
+      benchmark: 'officeqa',
+      split: 'test',
+      modelId: 'claude-sonnet-4-6',
+    })
+
+    expect(result.accuracy).toBeCloseTo(0.3)
+    expect(result.correctCount).toBe(3)
+    expect(result.taskCount).toBe(10)
+  })
+
+  it('handles all correct', async () => {
+    const tasks = makeTasks(5)
+    const allCorrect = new Set(tasks.map((t) => t.id))
+    const results = makeResults(tasks, allCorrect)
+
+    const result = await evaluate(tasks, results, {
+      scorer: (_q, predicted, groundTruth) => (predicted === groundTruth ? 1.0 : 0.0),
+      condition: 'test',
+      benchmark: 'officeqa',
+      split: 'test',
+      modelId: 'claude-sonnet-4-6',
+    })
+
+    expect(result.accuracy).toBe(1.0)
+    expect(result.correctCount).toBe(5)
+  })
+
+  it('handles all wrong', async () => {
+    const tasks = makeTasks(5)
+    const results = makeResults(tasks, new Set())
+
+    const result = await evaluate(tasks, results, {
+      scorer: () => 0.0,
+      condition: 'test',
+      benchmark: 'officeqa',
+      split: 'test',
+      modelId: 'claude-sonnet-4-6',
+    })
+
+    expect(result.accuracy).toBe(0)
+    expect(result.correctCount).toBe(0)
+  })
+
+  it('sums token costs', async () => {
+    const tasks = makeTasks(3)
+    const results = makeResults(tasks, new Set())
+
+    const result = await evaluate(tasks, results, {
+      scorer: () => 0.0,
+      condition: 'test',
+      benchmark: 'officeqa',
+      split: 'test',
+      modelId: 'claude-sonnet-4-6',
+    })
+
+    // 3 tasks × (100 input + 50 output) = 450 total tokens
+    expect(result.costTokens).toBe(450)
+    expect(result.costDollars).toBeGreaterThan(0)
+  })
+
+  it('handles error results gracefully', async () => {
+    const tasks = makeTasks(2)
+    const results: TaskResult[] = [
+      { taskId: 'test-1', predicted: '', tokens: { inputTokens: 0, outputTokens: 0 }, durationMs: 100, error: 'timeout' },
+      { taskId: 'test-2', predicted: 'Answer 2', tokens: { inputTokens: 100, outputTokens: 50 }, durationMs: 500 },
+    ]
+
+    const result = await evaluate(tasks, results, {
+      scorer: (_q, predicted, groundTruth) => (predicted === groundTruth ? 1.0 : 0.0),
+      condition: 'test',
+      benchmark: 'officeqa',
+      split: 'test',
+      modelId: 'claude-sonnet-4-6',
+    })
+
+    expect(result.correctCount).toBe(1)
+    expect(result.taskCount).toBe(2)
+  })
+})
+
+describe('aggregateSeeds', () => {
+  const makeResult = (accuracy: number, cost: number): EvoSkillBenchmarkResult => ({
+    condition: 'baseline',
+    benchmark: 'officeqa',
+    split: 'test',
+    accuracy,
+    taskCount: 100,
+    correctCount: Math.round(accuracy * 100),
+    costTokens: 1000,
+    costDollars: cost,
+    wallClockMs: 5000,
+  })
+
+  it('returns single result unchanged (no std)', () => {
+    const result = aggregateSeeds([makeResult(0.6, 1.5)])
+    expect(result.accuracy).toBe(0.6)
+    expect(result.accuracyStd).toBeUndefined()
+  })
+
+  it('computes mean and std for multiple seeds', () => {
+    const results = [makeResult(0.6, 1.0), makeResult(0.7, 1.2), makeResult(0.65, 1.1)]
+    const agg = aggregateSeeds(results)
+
+    expect(agg.accuracy).toBeCloseTo(0.65)
+    expect(agg.accuracyStd).toBeDefined()
+    expect(agg.accuracyStd!).toBeGreaterThan(0)
+  })
+
+  it('sums costs across seeds', () => {
+    const results = [makeResult(0.6, 1.0), makeResult(0.7, 1.5)]
+    const agg = aggregateSeeds(results)
+
+    expect(agg.costDollars).toBeCloseTo(2.5)
+    expect(agg.costTokens).toBe(2000)
+  })
+
+  it('throws for empty input', () => {
+    expect(() => aggregateSeeds([])).toThrow('Cannot aggregate 0 results')
+  })
+})
diff --git a/packages/core/tests/benchmarks/fixtures/evoskill-scorer-samples-dabstep.json b/packages/core/tests/benchmarks/fixtures/evoskill-scorer-samples-dabstep.json
new file mode 100644
index 00000000..cb8ac8d2
--- /dev/null
+++ b/packages/core/tests/benchmarks/fixtures/evoskill-scorer-samples-dabstep.json
@@ -0,0 +1,702 @@
+[
+  {
+    "question": "Q?",
+    "predicted": "4",
+    "groundTruth": "4",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "42",
+    "groundTruth": "42",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "3.14",
+    "groundTruth": "3.14",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "0.5",
+    "groundTruth": "0.5",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "1,000",
+    "groundTruth": "1,000",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "1000",
+    "groundTruth": "1,000",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "12,345.67",
+    "groundTruth": "12,345.67",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "12345.67",
+    "groundTruth": "12,345.67",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "paris",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "PARIS",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "shakespeare",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "A; B; C",
+    "groundTruth": "A; B; C",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "C; B; A",
+    "groundTruth": "A; B; C",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "A; B",
+    "groundTruth": "A; B; C",
+    "pythonScore": 0.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "A, B, C",
+    "groundTruth": "A, B, C",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "C, B, A",
+    "groundTruth": "A, B, C",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "100",
+    "groundTruth": "100",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "100.0",
+    "groundTruth": "100",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "99.99",
+    "groundTruth": "100",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "Shakespear",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 0.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "wrong",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 0.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "50%",
+    "groundTruth": "50%",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "50.0%",
+    "groundTruth": "50%",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "London",
+    "groundTruth": "Paris",
+    "pythonScore": 0.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "5",
+    "groundTruth": "4",
+    "pythonScore": 0.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "xyz",
+    "groundTruth": "42",
+    "pythonScore": 0.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "dabstep_scorer.question_scorer"
+  }
+]
\ No newline at end of file
diff --git a/packages/core/tests/benchmarks/fixtures/evoskill-scorer-samples-officeqa.json b/packages/core/tests/benchmarks/fixtures/evoskill-scorer-samples-officeqa.json
new file mode 100644
index 00000000..1e5f7b0a
--- /dev/null
+++ b/packages/core/tests/benchmarks/fixtures/evoskill-scorer-samples-officeqa.json
@@ -0,0 +1,702 @@
+[
+  {
+    "question": "What is 2+2?",
+    "predicted": "4",
+    "groundTruth": "4",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Capital of France?",
+    "predicted": "Paris",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who wrote Hamlet?",
+    "predicted": "Shakespeare",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who?",
+    "predicted": "paris",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who?",
+    "predicted": "PARIS",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who?",
+    "predicted": "shakespeare",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": " 4 ",
+    "groundTruth": "4",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "\"Paris\"",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "'Paris'",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "42",
+    "groundTruth": "42",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "3.14",
+    "groundTruth": "3.14",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "0.5",
+    "groundTruth": "0.5",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "100",
+    "groundTruth": "100",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "1,000",
+    "groundTruth": "1000",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "1,234,567",
+    "groundTruth": "1234567",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "12,345.67",
+    "groundTruth": "12345.67",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "100",
+    "groundTruth": "100",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "101",
+    "groundTruth": "100",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "105",
+    "groundTruth": "100",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "106",
+    "groundTruth": "100",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "200",
+    "groundTruth": "100",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "-5",
+    "groundTruth": "-5",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "-5.5",
+    "groundTruth": "-5.5",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "50%",
+    "groundTruth": "50%",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "3.5%",
+    "groundTruth": "3.5%",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "543 million",
+    "groundTruth": "543 million",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "2 billion",
+    "groundTruth": "2 billion",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "1.5 trillion",
+    "groundTruth": "1.5 trillion",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "When?",
+    "predicted": "March 1977",
+    "groundTruth": "March 1977",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "When?",
+    "predicted": "April 1977",
+    "groundTruth": "March 1977",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "When?",
+    "predicted": "1977",
+    "groundTruth": "March 1977",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "When?",
+    "predicted": "March 1978",
+    "groundTruth": "March 1977",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who?",
+    "predicted": "The answer is Paris",
+    "groundTruth": "Paris",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who?",
+    "predicted": "I think it's Shakespeare",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Capital?",
+    "predicted": "London",
+    "groundTruth": "Paris",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What?",
+    "predicted": "5",
+    "groundTruth": "4",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Who?",
+    "predicted": "Dickens",
+    "groundTruth": "Shakespeare",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What?",
+    "predicted": "completely wrong answer",
+    "groundTruth": "42",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "The answer is 0",
+    "groundTruth": "0",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What?",
+    "predicted": "Federal Old-Age and Survivors Insurance (OASI) Trust Fund",
+    "groundTruth": "Federal Old-Age and Survivors Insurance Trust Fund",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "When?",
+    "predicted": "2003",
+    "groundTruth": "2003",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "When?",
+    "predicted": "1999",
+    "groundTruth": "1999",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Q?",
+    "predicted": "The values are 10 and 20",
+    "groundTruth": "10 and 20",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many Kannada films have grossed at least \u20b9100 crore worldwide and also earned at least \u20b950 crore",
+    "predicted": "6",
+    "groundTruth": "6",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many Kannada films have grossed at least \u20b9100 crore worldwide and also earned at least \u20b950 crore",
+    "predicted": "wrong answer",
+    "groundTruth": "6",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What was the average 911 call answer time in Washington D.C. on April 30, 2025?",
+    "predicted": "3 seconds",
+    "groundTruth": "3 seconds",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What was the average 911 call answer time in Washington D.C. on April 30, 2025?",
+    "predicted": "wrong answer",
+    "groundTruth": "3 seconds",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many YouTube videos have surpassed 4 billion views?",
+    "predicted": "28",
+    "groundTruth": "28",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many YouTube videos have surpassed 4 billion views?",
+    "predicted": "wrong answer",
+    "groundTruth": "28",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many videos on YouTube have received more than 29 million likes?",
+    "predicted": "27",
+    "groundTruth": "27",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many videos on YouTube have received more than 29 million likes?",
+    "predicted": "wrong answer",
+    "groundTruth": "27",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many NBA players have scored 60 or more points in a regular season game since 2023?",
+    "predicted": "12 players",
+    "groundTruth": "12 players",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many NBA players have scored 60 or more points in a regular season game since 2023?",
+    "predicted": "wrong answer",
+    "groundTruth": "12 players",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many YouTube music videos have surpassed 7 billion views?",
+    "predicted": "5",
+    "groundTruth": "5",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many YouTube music videos have surpassed 7 billion views?",
+    "predicted": "wrong answer",
+    "groundTruth": "5",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many Studio Ghibli feature films have received a Rotten Tomatoes Tomatometer score below 90%?",
+    "predicted": "6",
+    "groundTruth": "6",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many Studio Ghibli feature films have received a Rotten Tomatoes Tomatometer score below 90%?",
+    "predicted": "wrong answer",
+    "groundTruth": "6",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many hotels in the United States have more than 3,500 rooms?",
+    "predicted": "13",
+    "groundTruth": "13",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many hotels in the United States have more than 3,500 rooms?",
+    "predicted": "wrong answer",
+    "groundTruth": "13",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many badminton athletes have won Olympic Gold and more than 2 Olympic medals in total?",
+    "predicted": "8",
+    "groundTruth": "8",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many badminton athletes have won Olympic Gold and more than 2 Olympic medals in total?",
+    "predicted": "wrong answer",
+    "groundTruth": "8",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "If I randomly select i.i.d. samples from a distribution n times, the distribution of the sampled var",
+    "predicted": "There is no theory that supports this statement.",
+    "groundTruth": "There is no theory that supports this statement.",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "If I randomly select i.i.d. samples from a distribution n times, the distribution of the sampled var",
+    "predicted": "wrong answer",
+    "groundTruth": "There is no theory that supports this statement.",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What is the most widely used ride booking application in India that offers a wide variety of vehicle",
+    "predicted": "Uber",
+    "groundTruth": "Uber",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "What is the most widely used ride booking application in India that offers a wide variety of vehicle",
+    "predicted": "wrong answer",
+    "groundTruth": "Uber",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Among artists with multiple wins for Album of the Year at the Grammys, who has received the most nom",
+    "predicted": "George Harrison",
+    "groundTruth": "George Harrison",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Among artists with multiple wins for Album of the Year at the Grammys, who has received the most nom",
+    "predicted": "wrong answer",
+    "groundTruth": "George Harrison",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many national flags of countries recognized as United Nations (UN) member or observer states hav",
+    "predicted": "16",
+    "groundTruth": "16",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many national flags of countries recognized as United Nations (UN) member or observer states hav",
+    "predicted": "wrong answer",
+    "groundTruth": "16",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many countries did both Donald Trump (during his first term) and Joe Biden visit during their pr",
+    "predicted": "18",
+    "groundTruth": "18",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How many countries did both Donald Trump (during his first term) and Joe Biden visit during their pr",
+    "predicted": "wrong answer",
+    "groundTruth": "18",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How much did the infant mortality rate (per 1,000 live births) decline in Northern Africa between 20",
+    "predicted": "8.7",
+    "groundTruth": "8.7",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "How much did the infant mortality rate (per 1,000 live births) decline in Northern Africa between 20",
+    "predicted": "wrong answer",
+    "groundTruth": "8.7",
+    "pythonScore": 0.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  },
+  {
+    "question": "Padding question?",
+    "predicted": "answer",
+    "groundTruth": "answer",
+    "pythonScore": 1.0,
+    "scorer": "reward.score_answer"
+  }
+]
\ No newline at end of file
diff --git a/packages/core/tests/benchmarks/harness.test.ts b/packages/core/tests/benchmarks/harness.test.ts
new file mode 100644
index 00000000..033e47a9
--- /dev/null
+++ b/packages/core/tests/benchmarks/harness.test.ts
@@ -0,0 +1,117 @@
+import { describe, it, expect } from 'vitest'
+import { runHarness } from '../../src/benchmarks/evoskill/harness.js'
+import type { HarnessConfig } from '../../src/benchmarks/evoskill/types.js'
+import type { HarnessDependencies, HarnessProgressEvent } from '../../src/benchmarks/evoskill/harness.js'
+
+// Minimal CSV dataset for testing
+const TEST_CSV = [
+  'question,answer',
+  'Q1?,A1',
+  'Q2?,A2',
+  'Q3?,A3',
+  'Q4?,A4',
+  'Q5?,A5',
+  'Q6?,A6',
+  'Q7?,A7',
+  'Q8?,A8',
+  'Q9?,A9',
+  'Q10?,A10',
+].join('\n')
+
+function createMockDeps(): HarnessDependencies {
+  return {
+    agentClient: {
+      async runTask() {
+        return { content: 'A1', inputTokens: 50, outputTokens: 20 }
+      },
+    },
+    getScorer: () => (_q: string, predicted: string, groundTruth: string) =>
+      predicted === groundTruth ? 1.0 : 0.0,
+    readFile: async () => TEST_CSV,
+  }
+}
+
+describe('runHarness', () => {
+  it('runs dry-run mode without API calls', async () => {
+    const config: HarnessConfig = {
+      benchmarks: ['officeqa'],
+      conditions: [
+        { name: 'baseline', skillSelector: async () => [], modelId: 'claude-sonnet-4-6', seed: 42 },
+      ],
+      seeds: [42],
+      sampleFraction: 1.0,
+      datasetDir: '/tmp',
+      outputDir: '/tmp/results',
+      dryRun: true,
+    }
+
+    const result = await runHarness(config, createMockDeps())
+
+    expect(result.results).toHaveLength(1)
+    expect(result.results[0].accuracy).toBe(0)
+    expect(result.results[0].costTokens).toBe(0)
+  })
+
+  it('emits progress events', async () => {
+    const events: HarnessProgressEvent[] = []
+    const config: HarnessConfig = {
+      benchmarks: ['officeqa'],
+      conditions: [
+        { name: 'test', skillSelector: async () => [], modelId: 'claude-sonnet-4-6', seed: 42 },
+      ],
+      seeds: [42],
+      sampleFraction: 1.0,
+      datasetDir: '/tmp',
+      outputDir: '/tmp/results',
+      dryRun: true,
+    }
+
+    await runHarness(config, createMockDeps(), (e) => events.push(e))
+
+    const types = events.map((e) => e.type)
+    expect(types).toContain('seed_start')
+    expect(types).toContain('condition_start')
+    expect(types).toContain('condition_complete')
+    expect(types).toContain('seed_complete')
+    expect(types).toContain('harness_complete')
+  })
+
+  it('uses different seeds for dataset splits', async () => {
+    const config: HarnessConfig = {
+      benchmarks: ['officeqa'],
+      conditions: [
+        { name: 'baseline', skillSelector: async () => [], modelId: 'claude-sonnet-4-6', seed: 42 },
+      ],
+      seeds: [42, 43],
+      sampleFraction: 1.0,
+      datasetDir: '/tmp',
+      outputDir: '/tmp/results',
+      dryRun: true,
+    }
+
+    const result = await runHarness(config, createMockDeps())
+    // Two seeds × one condition = 2 results
+    expect(result.results).toHaveLength(2)
+    // Aggregated should collapse to 1
+    expect(result.aggregated).toHaveLength(1)
+  })
+
+  it('applies sample fraction', async () => {
+    const config: HarnessConfig = {
+      benchmarks: ['officeqa'],
+      conditions: [
+        { name: 'test', skillSelector: async () => [], modelId: 'claude-sonnet-4-6', seed: 42 },
+      ],
+      seeds: [42],
+      sampleFraction: 0.5,
+      datasetDir: '/tmp',
+      outputDir: '/tmp/results',
+      dryRun: true,
+    }
+
+    const result = await runHarness(config, createMockDeps())
+    // 10 rows × 70% test × 50% sample ≈ 3-4 tasks
+    expect(result.results[0].taskCount).toBeLessThan(7)
+    expect(result.results[0].taskCount).toBeGreaterThan(0)
+  })
+})
diff --git a/packages/core/tests/benchmarks/ir-metrics.test.ts b/packages/core/tests/benchmarks/ir-metrics.test.ts
new file mode 100644
index 00000000..de7c14b9
--- /dev/null
+++ b/packages/core/tests/benchmarks/ir-metrics.test.ts
@@ -0,0 +1,186 @@
+import { describe, it, expect } from 'vitest'
+import { ndcg, mrr, mapAtK, precisionAtK, recallAtK } from '../../src/benchmarks/evoskill/ir-metrics.js'
+
+describe('IR Metrics', () => {
+  describe('nDCG', () => {
+    it('returns 1.0 for perfect ranking', () => {
+      const ranked = ['a', 'b', 'c']
+      const relevance = new Map([
+        ['a', 3],
+        ['b', 2],
+        ['c', 1],
+      ])
+      expect(ndcg(ranked, relevance, 3)).toBeCloseTo(1.0, 5)
+    })
+
+    it('returns less than 1.0 for imperfect ranking', () => {
+      const ranked = ['c', 'a', 'b']
+      const relevance = new Map([
+        ['a', 3],
+        ['b', 2],
+        ['c', 1],
+      ])
+      const score = ndcg(ranked, relevance, 3)
+      expect(score).toBeGreaterThan(0)
+      expect(score).toBeLessThan(1.0)
+    })
+
+    it('handles k smaller than ranked list', () => {
+      const ranked = ['a', 'b', 'c', 'd']
+      const relevance = new Map([
+        ['a', 3],
+        ['b', 2],
+        ['c', 1],
+        ['d', 0],
+      ])
+      const score = ndcg(ranked, relevance, 2)
+      expect(score).toBeCloseTo(1.0, 5) // top-2 are already in ideal order
+    })
+
+    it('returns 0 for empty results', () => {
+      expect(ndcg([], new Map([['a', 1]]), 5)).toBe(0)
+    })
+
+    it('returns 0 for empty relevance map', () => {
+      expect(ndcg(['a', 'b'], new Map(), 5)).toBe(0)
+    })
+
+    it('returns 0 when no ranked items have relevance', () => {
+      const ranked = ['x', 'y']
+      const relevance = new Map([['a', 3]])
+      expect(ndcg(ranked, relevance, 2)).toBe(0)
+    })
+
+    // Known-answer from IR textbook (Manning et al., Introduction to IR)
+    it('computes correct nDCG@5 for textbook example', () => {
+      // Example: ranked results with graded relevance 3, 2, 3, 0, 1
+      const ranked = ['d1', 'd2', 'd3', 'd4', 'd5']
+      const relevance = new Map([
+        ['d1', 3],
+        ['d2', 2],
+        ['d3', 3],
+        ['d4', 0],
+        ['d5', 1],
+      ])
+      // DCG@5 = 3/log2(2) + 2/log2(3) + 3/log2(4) + 0/log2(5) + 1/log2(6)
+      //       = 3/1 + 2/1.585 + 3/2 + 0 + 1/2.585
+      //       = 3 + 1.262 + 1.5 + 0 + 0.387 = 6.149
+      // Ideal: 3, 3, 2, 1, 0
+      // IDCG@5 = 3/1 + 3/1.585 + 2/2 + 1/2.322 + 0 = 3 + 1.893 + 1 + 0.431 = 6.324
+      // nDCG@5 = 6.149 / 6.324 ≈ 0.972
+      const score = ndcg(ranked, relevance, 5)
+      expect(score).toBeCloseTo(0.972, 2)
+    })
+  })
+
+  describe('MRR', () => {
+    it('returns 1.0 when first result is relevant', () => {
+      expect(mrr(['a', 'b', 'c'], new Set(['a']))).toBe(1.0)
+    })
+
+    it('returns 0.5 when second result is first relevant', () => {
+      expect(mrr(['b', 'a', 'c'], new Set(['a']))).toBe(0.5)
+    })
+
+    it('returns 1/3 when third result is first relevant', () => {
+      expect(mrr(['x', 'y', 'a'], new Set(['a']))).toBeCloseTo(1 / 3, 5)
+    })
+
+    it('returns 0 when no results are relevant', () => {
+      expect(mrr(['x', 'y', 'z'], new Set(['a']))).toBe(0)
+    })
+
+    it('returns 0 for empty results', () => {
+      expect(mrr([], new Set(['a']))).toBe(0)
+    })
+
+    it('returns 0 for empty relevant set', () => {
+      expect(mrr(['a', 'b'], new Set())).toBe(0)
+    })
+
+    it('returns 1.0 when all results are relevant', () => {
+      expect(mrr(['a', 'b', 'c'], new Set(['a', 'b', 'c']))).toBe(1.0)
+    })
+  })
+
+  describe('MAP@k', () => {
+    it('returns 1.0 for perfect ranking with all relevant', () => {
+      const ranked = ['a', 'b']
+      const relevant = new Set(['a', 'b'])
+      // P@1 = 1/1 (hit), P@2 = 2/2 (hit) → AP = (1 + 1) / 2 = 1.0
+      expect(mapAtK(ranked, relevant, 2)).toBeCloseTo(1.0, 5)
+    })
+
+    it('penalizes late relevant results', () => {
+      const ranked = ['x', 'a', 'y', 'b']
+      const relevant = new Set(['a', 'b'])
+      // P@2 = 1/2 (hit at pos 2), P@4 = 2/4 (hit at pos 4)
+      // AP = (0.5 + 0.5) / 2 = 0.5
+      expect(mapAtK(ranked, relevant, 4)).toBeCloseTo(0.5, 5)
+    })
+
+    it('returns 0 when no results are relevant', () => {
+      expect(mapAtK(['x', 'y'], new Set(['a']), 2)).toBe(0)
+    })
+
+    it('returns 0 for empty inputs', () => {
+      expect(mapAtK([], new Set(['a']), 5)).toBe(0)
+      expect(mapAtK(['a'], new Set(), 5)).toBe(0)
+    })
+
+    it('handles k larger than result list', () => {
+      const ranked = ['a']
+      const relevant = new Set(['a', 'b'])
+      // Only 1 result, it's relevant: P@1 = 1/1 → AP = 1/2 (normalize by relevant.size=2)
+      expect(mapAtK(ranked, relevant, 10)).toBeCloseTo(0.5, 5)
+    })
+  })
+
+  describe('Precision@k', () => {
+    it('returns 1.0 when all top-k are relevant', () => {
+      expect(precisionAtK(['a', 'b'], new Set(['a', 'b', 'c']), 2)).toBe(1.0)
+    })
+
+    it('returns 0.5 when half of top-k are relevant', () => {
+      expect(precisionAtK(['a', 'x'], new Set(['a']), 2)).toBe(0.5)
+    })
+
+    it('returns 0 when none are relevant', () => {
+      expect(precisionAtK(['x', 'y'], new Set(['a']), 2)).toBe(0)
+    })
+
+    it('handles k larger than result list', () => {
+      // k=5 but only 2 results, 1 relevant → 1/2
+      expect(precisionAtK(['a', 'x'], new Set(['a']), 5)).toBe(0.5)
+    })
+
+    it('returns 0 for empty inputs', () => {
+      expect(precisionAtK([], new Set(['a']), 5)).toBe(0)
+      expect(precisionAtK(['a'], new Set(), 5)).toBe(0)
+    })
+  })
+
+  describe('Recall@k', () => {
+    it('returns 1.0 when all relevant items are in top-k', () => {
+      expect(recallAtK(['a', 'b', 'x'], new Set(['a', 'b']), 3)).toBe(1.0)
+    })
+
+    it('returns 0.5 when half of relevant items are in top-k', () => {
+      expect(recallAtK(['a', 'x'], new Set(['a', 'b']), 2)).toBe(0.5)
+    })
+
+    it('returns 0 when no relevant items are in top-k', () => {
+      expect(recallAtK(['x', 'y'], new Set(['a', 'b']), 2)).toBe(0)
+    })
+
+    it('returns 0 for empty inputs', () => {
+      expect(recallAtK([], new Set(['a']), 5)).toBe(0)
+      expect(recallAtK(['a'], new Set(), 5)).toBe(0)
+    })
+
+    it('returns correct ratio for single relevant item', () => {
+      expect(recallAtK(['x', 'a', 'y'], new Set(['a']), 3)).toBe(1.0)
+      expect(recallAtK(['x', 'y', 'z'], new Set(['a']), 3)).toBe(0)
+    })
+  })
+})
diff --git a/packages/core/tests/benchmarks/report.test.ts b/packages/core/tests/benchmarks/report.test.ts
new file mode 100644
index 00000000..f2eaebf5
--- /dev/null
+++ b/packages/core/tests/benchmarks/report.test.ts
@@ -0,0 +1,123 @@
+import { describe, it, expect } from 'vitest'
+import { generateMarkdownReport, generateJsonReport } from '../../src/benchmarks/evoskill/report.js'
+import type { HarnessResult } from '../../src/benchmarks/evoskill/harness.js'
+import type { EvoSkillBenchmarkResult } from '../../src/benchmarks/evoskill/types.js'
+
+function makeResult(overrides: Partial<EvoSkillBenchmarkResult> = {}): EvoSkillBenchmarkResult {
+  return {
+    condition: 'baseline',
+    benchmark: 'officeqa',
+    split: 'test',
+    accuracy: 0.6,
+    taskCount: 100,
+    correctCount: 60,
+    costTokens: 50000,
+    costDollars: 1.5,
+    wallClockMs: 30000,
+    ...overrides,
+  }
+}
+
+function makeHarnessResult(results: EvoSkillBenchmarkResult[]): HarnessResult {
+  return {
+    results,
+    aggregated: results,
+    wallClockMs: 60000,
+  }
+}
+
+describe('generateMarkdownReport', () => {
+  it('generates a valid markdown table', () => {
+    const result = makeHarnessResult([
+      makeResult({ condition: 'baseline', benchmark: 'officeqa', accuracy: 0.6 }),
+      makeResult({ condition: 'search', benchmark: 'officeqa', accuracy: 0.7 }),
+    ])
+
+    const md = generateMarkdownReport(result)
+    expect(md).toContain('# EvoSkill Benchmark Results')
+    expect(md).toContain('| baseline')
+    expect(md).toContain('| search')
+    expect(md).toContain('60.0%')
+    expect(md).toContain('70.0%')
+  })
+
+  it('formats accuracy with std when present', () => {
+    const result = makeHarnessResult([
+      makeResult({ accuracy: 0.65, accuracyStd: 0.03 }),
+    ])
+
+    const md = generateMarkdownReport(result)
+    expect(md).toContain('65.0 ± 3.0%')
+  })
+
+  it('renders dash for missing benchmarks', () => {
+    const result = makeHarnessResult([
+      makeResult({ benchmark: 'officeqa' }),
+    ])
+
+    const md = generateMarkdownReport(result)
+    // sealqa and browsecomp columns should have dashes
+    expect(md).toContain('—')
+  })
+
+  it('includes Pareto frontier section', () => {
+    const result = makeHarnessResult([
+      makeResult({ condition: 'cheap', accuracy: 0.5, costDollars: 0.5 }),
+      makeResult({ condition: 'expensive', accuracy: 0.9, costDollars: 5.0 }),
+    ])
+
+    const md = generateMarkdownReport(result)
+    expect(md).toContain('Pareto Frontier')
+    expect(md).toContain('Pareto-Optimal')
+  })
+
+  it('includes IR metrics table when present', () => {
+    const result = makeHarnessResult([
+      makeResult({ irMetrics: { ndcg5: 0.85, mrr: 0.9, map5: 0.75 } }),
+    ])
+
+    const md = generateMarkdownReport(result)
+    expect(md).toContain('IR Metrics')
+    expect(md).toContain('0.850')
+  })
+
+  it('accepts custom title', () => {
+    const result = makeHarnessResult([makeResult()])
+    const md = generateMarkdownReport(result, { title: 'Custom Title' })
+    expect(md).toContain('# Custom Title')
+  })
+})
+
+describe('generateJsonReport', () => {
+  it('generates valid JSON', () => {
+    const result = makeHarnessResult([makeResult()])
+    const json = generateJsonReport(result)
+    const parsed = JSON.parse(json)
+
+    expect(parsed.generatedAt).toBeDefined()
+    expect(parsed.wallClockMs).toBe(60000)
+    expect(parsed.aggregated).toHaveLength(1)
+    expect(parsed.results).toHaveLength(1)
+  })
+
+  it('omits accuracyStd when undefined', () => {
+    const result = makeHarnessResult([makeResult()])
+    const json = generateJsonReport(result)
+    const parsed = JSON.parse(json)
+
+    expect(parsed.results[0].accuracyStd).toBeUndefined()
+  })
+
+  it('includes Pareto frontier', () => {
+    const result = makeHarnessResult([
+      makeResult({ condition: 'a', accuracy: 0.9, costDollars: 1.0 }),
+      makeResult({ condition: 'b', accuracy: 0.5, costDollars: 2.0 }),
+    ])
+    const json = generateJsonReport(result)
+    const parsed = JSON.parse(json)
+
+    expect(parsed.paretoFrontier.length).toBeGreaterThan(0)
+    // 'a' dominates 'b' (higher accuracy, lower cost)
+    expect(parsed.paretoFrontier[0].condition).toBe('a')
+  })
+})
diff --git a/packages/core/tests/benchmarks/scorer-validation.test.ts b/packages/core/tests/benchmarks/scorer-validation.test.ts
new file mode 100644
index 00000000..842970d2
--- /dev/null
+++ b/packages/core/tests/benchmarks/scorer-validation.test.ts
@@ -0,0 +1,185 @@
+import { describe, it, expect } from 'vitest'
+import { readFileSync } from 'fs'
+import { join } from 'path'
+import { exactMatchScorer } from '../../src/benchmarks/evoskill/scorers.js'
+
+interface FixtureSample {
+  question: string
+  predicted: string
+  groundTruth: string
+  pythonScore: number
+  scorer: string
+}
+
+function loadFixtures(filename: string): FixtureSample[] {
+  const filePath = join(__dirname, 'fixtures', filename)
+  return JSON.parse(readFileSync(filePath, 'utf-8'))
+}
+
+/**
+ * Known divergences between TypeScript exactMatchScorer and Python scorers.
+ * These are documented and accepted differences in scoring behavior.
+ *
+ * - Python reward.py supports substring matching ("Paris" in "The answer is Paris")
+ *   TypeScript requires exact match after normalization.
+ * - Python reward.py strips parentheticals for text comparison
+ *   TypeScript does not.
+ * - Python dabstep_scorer uses SequenceMatcher (>0.95 similarity)
+ *   TypeScript does not do fuzzy string matching.
+ * - Python dabstep_scorer supports list reordering (semicolon/comma separated)
+ *   TypeScript does not.
+ */
+
+describe('Cross-validate OfficeQA scorer against Python (reward.py)', () => {
+  const fixtures = loadFixtures('evoskill-scorer-samples-officeqa.json')
+
+  it('has 100 fixture samples', () => {
+    expect(fixtures).toHaveLength(100)
+  })
+
+  it('has a mix of correct and incorrect samples', () => {
+    const correct = fixtures.filter((f) => f.pythonScore === 1.0).length
+    const incorrect = fixtures.filter((f) => f.pythonScore === 0.0).length
+    expect(correct).toBeGreaterThan(20)
+    expect(incorrect).toBeGreaterThan(5)
+  })
+
+  // Known divergences where Python matches but TypeScript doesn't (or vice versa)
+  const KNOWN_DIVERGENCES = new Set([
+    // Python reward.py supports substring matching; TypeScript does not
+    'The answer is Paris|Paris',
+    "I think it's Shakespeare|Shakespeare",
+    'The answer is 0|0',
+    'The values are 10 and 20|10 and 20',
+    // Python strips parentheticals; TypeScript does not
+    'Federal Old-Age and Survivors Insurance (OASI) Trust Fund|Federal Old-Age and Survivors Insurance Trust Fund',
+  ])
+
+  it('diverges ≤5% from Python scorer (excluding known divergences)', () => {
+    let disagreements = 0
+    const diverged: string[] = []
+
+    for (const sample of fixtures) {
+      const key = `${sample.predicted}|${sample.groundTruth}`
+      if (KNOWN_DIVERGENCES.has(key)) continue
+
+      const tsScore = exactMatchScorer(sample.question, sample.predicted, sample.groundTruth)
+      const pyScore = sample.pythonScore
+
+      if ((tsScore >= 0.5 ? 1 : 0) !== (pyScore >= 0.5 ? 1 : 0)) {
+        disagreements++
+        diverged.push(
+          `predicted=${JSON.stringify(sample.predicted)} gt=${JSON.stringify(sample.groundTruth)} ts=${tsScore} py=${pyScore}`
+        )
+      }
+    }
+
+    const effectiveTotal = fixtures.length - KNOWN_DIVERGENCES.size
+    const divergenceRate = disagreements / effectiveTotal
+
+    if (diverged.length > 0) {
+      console.log(`Divergences (${diverged.length}):`)
+      for (const d of diverged) console.log(`  ${d}`)
+    }
+
+    expect(divergenceRate).toBeLessThanOrEqual(0.05)
+  })
+
+  it('agrees on exact-match cases', () => {
+    const exactCases = fixtures.filter(
+      (f) => f.predicted.trim().toLowerCase() === f.groundTruth.trim().toLowerCase()
+    )
+    expect(exactCases.length).toBeGreaterThan(10)
+
+    for (const sample of exactCases) {
+      const tsScore = exactMatchScorer(sample.question, sample.predicted, sample.groundTruth)
+      expect(tsScore).toBe(1.0)
+    }
+  })
+
+  it('agrees on clearly wrong answers', () => {
+    const wrongCases = fixtures.filter((f) => f.predicted === 'wrong answer')
+    expect(wrongCases.length).toBeGreaterThan(5)
+
+    for (const sample of wrongCases) {
+      const tsScore = exactMatchScorer(sample.question, sample.predicted, sample.groundTruth)
+      expect(tsScore).toBe(0.0)
+    }
+  })
+})
+
+describe('Cross-validate DABStep scorer against Python (dabstep_scorer)', () => {
+  const fixtures = loadFixtures('evoskill-scorer-samples-dabstep.json')
+
+  it('has 100 fixture samples', () => {
+    expect(fixtures).toHaveLength(100)
+  })
+
+  // DABStep-specific divergences
+  const KNOWN_DIVERGENCES = new Set([
+    // Python dabstep_scorer supports list reordering; TypeScript does not
+    'C; B; A|A; B; C',
+    'C, B, A|A, B, C',
+    // Python dabstep_scorer uses SequenceMatcher (>0.95); TypeScript does not
+    'Shakespear|Shakespeare',
+    // TypeScript splits ground truth by ', ' as alternatives; DABStep treats as list
+    'A, B, C|A, B, C',
+    // Python dabstep_scorer uses math.isclose(rel_tol=1e-4); TypeScript uses absolute ±0.01
+    '99.99|100',
+  ])
+
+  it('diverges ≤5% from Python scorer (excluding known divergences)', () => {
+    let disagreements = 0
+    const diverged: string[] = []
+
+    for (const sample of fixtures) {
+      const key = `${sample.predicted}|${sample.groundTruth}`
+      if (KNOWN_DIVERGENCES.has(key)) continue
+
+      const tsScore = exactMatchScorer(sample.question, sample.predicted, sample.groundTruth)
+      const pyScore = sample.pythonScore
+
+      if ((tsScore >= 0.5 ? 1 : 0) !== (pyScore >= 0.5 ? 1 : 0)) {
+        disagreements++
+        diverged.push(
+          `predicted=${JSON.stringify(sample.predicted)} gt=${JSON.stringify(sample.groundTruth)} ts=${tsScore} py=${pyScore}`
+        )
+      }
+    }
+
+    const effectiveTotal = fixtures.length - KNOWN_DIVERGENCES.size
+    const divergenceRate = disagreements / effectiveTotal
+
+    if (diverged.length > 0) {
+      console.log(`Divergences (${diverged.length}):`)
+      for (const d of diverged) console.log(`  ${d}`)
+    }
+
+    expect(divergenceRate).toBeLessThanOrEqual(0.05)
+  })
+
+  it('agrees on exact-match cases', () => {
+    // Skip list-pattern cases (contain commas or semicolons) where TS splits as alternatives
+    const exactCases = fixtures.filter(
+      (f) =>
+        f.predicted.trim().toLowerCase() === f.groundTruth.trim().toLowerCase() &&
+        !f.groundTruth.includes(', ') &&
+        !f.groundTruth.includes('; ')
+    )
+    expect(exactCases.length).toBeGreaterThan(10)
+
+    for (const sample of exactCases) {
+      const tsScore = exactMatchScorer(sample.question, sample.predicted, sample.groundTruth)
+      expect(tsScore).toBe(1.0)
+    }
+  })
+
+  it('agrees on clearly wrong answers', () => {
+    const wrongCases = fixtures.filter((f) => f.predicted === 'completely_wrong_answer_xyz')
+
+    for (const sample of wrongCases) {
+      const tsScore = exactMatchScorer(sample.question, sample.predicted, sample.groundTruth)
+      expect(tsScore).toBe(0.0)
+    }
+  })
+})
diff --git a/packages/core/tests/benchmarks/scorers.test.ts b/packages/core/tests/benchmarks/scorers.test.ts
new file mode 100644
index 00000000..f3098a76
--- /dev/null
+++ b/packages/core/tests/benchmarks/scorers.test.ts
@@ -0,0 +1,64 @@
+import { describe, it, expect } from 'vitest'
+import { exactMatchScorer } from '../../src/benchmarks/evoskill/scorers.js'
+
+describe('exactMatchScorer', () => {
+  const q = 'test question' // question is unused in exact-match
+
+  it('matches identical strings', () => {
+    expect(exactMatchScorer(q, 'hello', 'hello')).toBe(1.0)
+  })
+
+  it('matches case-insensitively', () => {
+    expect(exactMatchScorer(q, 'Hello', 'hello')).toBe(1.0)
+    expect(exactMatchScorer(q, 'HELLO', 'hello')).toBe(1.0)
+  })
+
+  it('strips trailing punctuation', () => {
+    expect(exactMatchScorer(q, 'hello.', 'hello')).toBe(1.0)
+    expect(exactMatchScorer(q, 'hello!', 'hello')).toBe(1.0)
+    expect(exactMatchScorer(q, 'hello?', 'hello')).toBe(1.0)
+  })
+
+  it('strips whitespace', () => {
+    expect(exactMatchScorer(q, '  hello  ', 'hello')).toBe(1.0)
+  })
+
+  it('handles numeric tolerance', () => {
+    expect(exactMatchScorer(q, '42.005', '42.00')).toBe(1.0)
+    expect(exactMatchScorer(q, '42.02', '42.00')).toBe(0.0)
+  })
+
+  it('handles with/without units', () => {
+    expect(exactMatchScorer(q, '42 kg', '42')).toBe(1.0)
+    expect(exactMatchScorer(q, '42', '42 kg')).toBe(1.0)
+  })
+
+  it('handles comma-separated alternatives in ground truth', () => {
+    expect(exactMatchScorer(q, 'foo', 'foo, bar, baz')).toBe(1.0)
+    expect(exactMatchScorer(q, 'bar', 'foo, bar, baz')).toBe(1.0)
+    expect(exactMatchScorer(q, 'baz', 'foo, bar, baz')).toBe(1.0)
+    expect(exactMatchScorer(q, 'qux', 'foo, bar, baz')).toBe(0.0)
+  })
+
+  it('handles commas in numbers', () => {
+    expect(exactMatchScorer(q, '1,000', '1000')).toBe(1.0)
+    expect(exactMatchScorer(q, '1000', '1,000')).toBe(1.0)
+  })
+
+  it('handles percentage sign', () => {
+    expect(exactMatchScorer(q, '42%', '42')).toBe(1.0)
+    expect(exactMatchScorer(q, '42', '42%')).toBe(1.0)
+  })
+
+  it('returns 0.0 for non-matching strings', () => {
+    expect(exactMatchScorer(q, 'hello', 'world')).toBe(0.0)
+  })
+
+  it('returns 0.0 for empty predicted', () => {
+    expect(exactMatchScorer(q, '', 'hello')).toBe(0.0)
+  })
+
+  it('handles numeric ground truth vs text predicted', () => {
+    expect(exactMatchScorer(q, 'forty-two', '42')).toBe(0.0)
+  })
+})
diff --git a/packages/core/tests/benchmarks/skill-selector.test.ts b/packages/core/tests/benchmarks/skill-selector.test.ts
new file mode 100644
index 00000000..a3e11b43
--- /dev/null
+++ b/packages/core/tests/benchmarks/skill-selector.test.ts
@@ -0,0 +1,129 @@
+import { describe, it, expect } from 'vitest'
+import {
+  createBaselineSelector,
+  createCuratedSelector,
+  createIterativeSelector,
+  createSearchSelector,
+  createRecommendSelector,
+  createOptimizedSelector,
+  createHybridSelector,
+  createEvoSkillEvolvedSelector,
+  NotImplementedError,
+  CONDITIONS,
+} from '../../src/benchmarks/evoskill/skill-selector.js'
+import type { BenchmarkTask } from '../../src/benchmarks/evoskill/types.js'
+
+const tasks: BenchmarkTask[] = [
+  { id: 't1', question: 'What is 2+2?', groundTruth: '4', split: 'test', benchmark: 'officeqa' },
+  { id: 't2', question: 'Capital of France?', groundTruth: 'Paris', split: 'test', benchmark: 'officeqa' },
+]
+
+describe('createBaselineSelector (condition 1)', () => {
+  it('returns empty array', async () => {
+    const selector = createBaselineSelector()
+    const skills = await selector(tasks)
+    expect(skills).toEqual([])
+  })
+})
+
+describe('createCuratedSelector (condition 9)', () => {
+  it('returns provided skill contents', async () => {
+    const skills = ['skill content 1', 'skill content 2']
+    const selector = createCuratedSelector(skills)
+    const result = await selector(tasks)
+    expect(result).toEqual(skills)
+  })
+
+  it('returns empty for empty input', async () => {
+    const selector = createCuratedSelector([])
+    const result = await selector(tasks)
+    expect(result).toEqual([])
+  })
+})
+
+describe('createSearchSelector (condition 3)', () => {
+  it('calls search client and returns top result', async () => {
+    const mockClient = {
+      search: async () => [
+        { content: 'best skill', score: 0.95 },
+        { content: 'second skill', score: 0.8 },
+      ],
+    }
+    const selector = createSearchSelector(mockClient)
+    const result = await selector(tasks)
+    expect(result).toEqual(['best skill'])
+  })
+
+  it('returns empty when search finds nothing', async () => {
+    const mockClient = { search: async () => [] }
+    const selector = createSearchSelector(mockClient)
+    const result = await selector(tasks)
+    expect(result).toEqual([])
+  })
+})
+
+describe('createRecommendSelector (condition 4)', () => {
+  it('calls recommend client and returns top result', async () => {
+    const mockClient = {
+      recommend: async () => [{ content: 'recommended skill', score: 0.9 }],
+    }
+    const selector = createRecommendSelector(mockClient)
+    const result = await selector(tasks)
+    expect(result).toEqual(['recommended skill'])
+  })
+})
+
+describe('createIterativeSelector (condition 7)', () => {
+  it('throws NotImplementedError', async () => {
+    const selector = createIterativeSelector()
+    await expect(selector(tasks)).rejects.toThrow(NotImplementedError)
+    await expect(selector(tasks)).rejects.toThrow('Study B')
+  })
+})
+
+describe('createOptimizedSelector (condition 5)', () => {
+  it('searches then optimizes the top result', async () => {
+    const mockSearch = { search: async () => [{ content: 'base skill', score: 0.9 }] }
+    const mockTransform = { optimize: async (s: string) => `optimized: ${s}` }
+    const selector = createOptimizedSelector(mockSearch, mockTransform)
+    const result = await selector(tasks)
+    expect(result).toEqual(['optimized: base skill'])
+  })
+
+  it('returns empty when search finds nothing', async () => {
+    const mockSearch = { search: async () => [] }
+    const mockTransform = { optimize: async (s: string) => s }
+    const selector = createOptimizedSelector(mockSearch, mockTransform)
+    const result = await selector(tasks)
+    expect(result).toEqual([])
+  })
+})
+
+describe('createHybridSelector (condition 8)', () => {
+  it('searches then evolves the top result', async () => {
+    const mockSearch = { search: async () => [{ content: 'base', score: 0.8 }] }
+    const evolve = async (s: string) => `evolved: ${s}`
+    const selector = createHybridSelector(mockSearch, evolve)
+    const result = await selector(tasks)
+    expect(result).toEqual(['evolved: base'])
+  })
+})
+
+describe('createEvoSkillEvolvedSelector (condition 2)', () => {
+  it('rejects path traversal', () => {
+    expect(() => createEvoSkillEvolvedSelector('/foo/../bar')).toThrow("must not contain '..'")
+  })
+})
+
+describe('CONDITIONS registry', () => {
+  it('has 9 conditions', () => {
+    expect(Object.keys(CONDITIONS)).toHaveLength(9)
+  })
+
+  it('maps numbers to names', () => {
+    expect(CONDITIONS[1]).toBe('baseline')
+    expect(CONDITIONS[7]).toBe('skillsmith-iterative')
+    expect(CONDITIONS[8]).toBe('hybrid')
+    expect(CONDITIONS[9]).toBe('skillsmith-curated')
+  })
+})
diff --git a/packages/core/tests/evaluation/FailureAnalyzer.test.ts b/packages/core/tests/evaluation/FailureAnalyzer.test.ts
new file mode 100644
index 00000000..cd5fa3ff
--- /dev/null
+++ b/packages/core/tests/evaluation/FailureAnalyzer.test.ts
@@ -0,0 +1,387 @@
+/**
+ * @fileoverview Tests for FailureAnalyzer (SMI-3295)
+ * @module @skillsmith/core/tests/evaluation/FailureAnalyzer
+ *
+ * Tests heuristic categorization with synthetic failures:
+ *  - Each category individually
+ *  - Frequency counting and example capping
+ *  - Edge cases (empty input, single failure, all same category)
+ *  - Hallucination false-positive guard (must not dominate mixed sets)
+ *  - suggestedFix template correctness
+ */
+
+import { describe, it, expect } from 'vitest'
+import { FailureAnalyzer } from '../../src/evaluation/FailureAnalyzer.js'
+import type { TaskFailure } from '../../src/evaluation/types.js'
+
+function makeFailure(overrides: Partial<TaskFailure> = {}): TaskFailure {
+  return {
+    taskId: 'task-1',
+    predicted: 'answer',
+    groundTruth: 'correct',
+    agentOutput: 'I answered: answer',
+    ...overrides,
+  }
+}
+
+describe('FailureAnalyzer — heuristic mode', () => {
+  const analyzer = new FailureAnalyzer({ mode: 'heuristic' })
+
+  // ==========================================================================
+  // Individual category detection
+  // ==========================================================================
+
+  describe('wrong_format detection', () => {
+    it('detects number vs string mismatch', () => {
+      const failure = makeFailure({
+        predicted: 'forty-two',
+        groundTruth: '42',
+        agentOutput: 'The answer is forty-two',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('wrong_format')
+    })
+
+    it('detects list vs scalar mismatch', () => {
+      const failure = makeFailure({
+        predicted: 'Paris',
+        groundTruth: 'Paris, London, Berlin',
+        agentOutput: 'The answer is Paris',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('wrong_format')
+    })
+
+    it('detects drastically different lengths', () => {
+      const failure = makeFailure({
+        predicted: 'A very long detailed response that goes on and on and on with many details',
+        groundTruth: 'Yes',
+        agentOutput: 'A very long detailed response that goes on and on and on with many details',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('wrong_format')
+    })
+  })
+
+  describe('missing_context detection', () => {
+    it('detects "cannot determine" phrase', () => {
+      const failure = makeFailure({
+        agentOutput: 'I cannot determine the answer from the available information.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('missing_context')
+    })
+
+    it('detects "not provided" phrase', () => {
+      const failure = makeFailure({
+        agentOutput: 'The required data is not provided in the context.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('missing_context')
+    })
+
+    it('detects "I don\'t have enough information"', () => {
+      const failure = makeFailure({
+        agentOutput: "I don't have enough information to answer this question.",
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('missing_context')
+    })
+  })
+
+  describe('tool_misuse detection', () => {
+    it('detects failed tool call', () => {
+      const failure = makeFailure({
+        toolCallFailed: true,
+        agentOutput: 'I tried to search but the tool returned an error.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('tool_misuse')
+    })
+
+    it('detects zero tool calls when output references file/search', () => {
+      const failure = makeFailure({
+        toolCallCount: 0,
+        agentOutput: 'Looking at the file contents, I would say the answer is 42.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('tool_misuse')
+    })
+  })
+
+  describe('reasoning_error detection (fallback)', () => {
+    it('categorizes same-type wrong-value as reasoning error', () => {
+      const failure = makeFailure({
+        predicted: '37',
+        groundTruth: '42',
+        agentOutput: 'I think the answer is 37.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('reasoning_error')
+    })
+  })
+
+  describe('hallucination detection', () => {
+    it('detects confident wrong answer (no hedging)', () => {
+      const failure = makeFailure({
+        predicted: 'Paris',
+        groundTruth: 'Berlin',
+        agentOutput: 'The capital of Germany is Paris. This is a well-established fact.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('hallucination')
+    })
+
+    it('does not flag hedging answer as hallucination', () => {
+      const failure = makeFailure({
+        predicted: 'Paris',
+        groundTruth: 'Berlin',
+        agentOutput: 'I think the capital might be Paris, but it could also be Berlin.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      // Should fall through to reasoning_error since hedging is present
+      expect(patterns[0].category).toBe('reasoning_error')
+    })
+
+    it('does not flag very short output as hallucination', () => {
+      const failure = makeFailure({
+        predicted: 'No',
+        groundTruth: 'Yes',
+        agentOutput: 'No.',
+      })
+
+      const patterns = analyzer.analyze([failure])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('reasoning_error')
+    })
+  })
+
+  // ==========================================================================
+  // Frequency counting and ordering
+  // ==========================================================================
+
+  describe('frequency counting', () => {
+    it('counts frequencies and sorts descending', () => {
+      const failures: TaskFailure[] = [
+        // 3x wrong_format
+        makeFailure({ taskId: 'f1', predicted: 'word', groundTruth: '42', agentOutput: 'word' }),
+        makeFailure({ taskId: 'f2', predicted: 'text', groundTruth: '99', agentOutput: 'text' }),
+        makeFailure({ taskId: 'f3', predicted: 'abc', groundTruth: '7', agentOutput: 'abc' }),
+        // 2x missing_context
+        makeFailure({
+          taskId: 'f4',
+          agentOutput: 'I cannot determine the answer',
+        }),
+        makeFailure({
+          taskId: 'f5',
+          agentOutput: 'The data is not provided here',
+        }),
+        // 1x tool_misuse
+        makeFailure({
+          taskId: 'f6',
+          toolCallFailed: true,
+          agentOutput: 'Tool failed during execution',
+        }),
+      ]
+
+      const patterns = analyzer.analyze(failures)
+      expect(patterns.length).toBeGreaterThanOrEqual(3)
+      expect(patterns[0].category).toBe('wrong_format')
+      expect(patterns[0].frequency).toBe(3)
+      expect(patterns[1].category).toBe('missing_context')
+      expect(patterns[1].frequency).toBe(2)
+    })
+  })
+
+  // ==========================================================================
+  // Example capping
+  // ==========================================================================
+
+  describe('example capping', () => {
+    it('caps examples at 5 per category by default', () => {
+      const failures: TaskFailure[] = Array.from({ length: 10 }, (_, i) =>
+        makeFailure({
+          taskId: `task-${i}`,
+          predicted: 'text',
+          groundTruth: `${i}`,
+          agentOutput: `The answer is text-${i}`,
+        })
+      )
+
+      const patterns = analyzer.analyze(failures)
+      const formatPattern = patterns.find((p) => p.category === 'wrong_format')
+      expect(formatPattern).toBeDefined()
+      expect(formatPattern!.examples.length).toBeLessThanOrEqual(5)
+      expect(formatPattern!.frequency).toBe(10) // frequency counts all
+    })
+
+    it('respects custom maxExamplesPerCategory', () => {
+      const customAnalyzer = new FailureAnalyzer({
+        mode: 'heuristic',
+        maxExamplesPerCategory: 2,
+      })
+
+      const failures: TaskFailure[] = Array.from({ length: 5 }, (_, i) =>
+        makeFailure({
+          taskId: `task-${i}`,
+          predicted: 'text',
+          groundTruth: `${i}`,
+          agentOutput: `The answer is text-${i}`,
+        })
+      )
+
+      const patterns = customAnalyzer.analyze(failures)
+      const formatPattern = patterns.find((p) => p.category === 'wrong_format')
+      expect(formatPattern!.examples).toHaveLength(2)
+    })
+  })
+
+  // ==========================================================================
+  // suggestedFix templates
+  // ==========================================================================
+
+  describe('suggestedFix templates', () => {
+    it('provides correct template for each category', () => {
+      const failures: TaskFailure[] = [
+        // wrong_format
+        makeFailure({ taskId: 'f1', predicted: 'word', groundTruth: '42', agentOutput: 'word' }),
+        // missing_context
+        makeFailure({ taskId: 'f2', agentOutput: 'Cannot determine the answer' }),
+        // tool_misuse
+        makeFailure({ taskId: 'f3', toolCallFailed: true, agentOutput: 'Tool error occurred' }),
+      ]
+
+      const patterns = analyzer.analyze(failures)
+      const formatP = patterns.find((p) => p.category === 'wrong_format')
+      const contextP = patterns.find((p) => p.category === 'missing_context')
+      const toolP = patterns.find((p) => p.category === 'tool_misuse')
+
+      expect(formatP!.suggestedFix).toContain('output format instructions')
+      expect(contextP!.suggestedFix).toContain('context retrieval')
+      expect(toolP!.suggestedFix).toContain('tool usage guidance')
+    })
+  })
+
+  // ==========================================================================
+  // Edge cases
+  // ==========================================================================
+
+  describe('edge cases', () => {
+    it('returns empty array for no failures', () => {
+      expect(analyzer.analyze([])).toEqual([])
+    })
+
+    it('handles single failure', () => {
+      const patterns = analyzer.analyze([
+        makeFailure({
+          agentOutput: 'I cannot determine the answer from available data.',
+        }),
+      ])
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].frequency).toBe(1)
+    })
+
+    it('handles all failures in same category', () => {
+      const failures = Array.from({ length: 3 }, (_, i) =>
+        makeFailure({
+          taskId: `task-${i}`,
+          agentOutput: `I cannot determine answer ${i}`,
+        })
+      )
+
+      const patterns = analyzer.analyze(failures)
+      expect(patterns).toHaveLength(1)
+      expect(patterns[0].category).toBe('missing_context')
+      expect(patterns[0].frequency).toBe(3)
+    })
+  })
+
+  // ==========================================================================
+  // Hallucination false-positive guard
+  // ==========================================================================
+
+  describe('hallucination false-positive guard', () => {
+    it('hallucination does not dominate when clear format errors exist', () => {
+      const failures: TaskFailure[] = [
+        // 3x clear wrong_format
+        makeFailure({
+          taskId: 'f1',
+          predicted: 'word',
+          groundTruth: '42',
+          agentOutput: 'The answer is word',
+        }),
+        makeFailure({
+          taskId: 'f2',
+          predicted: 'text',
+          groundTruth: '99',
+          agentOutput: 'The answer is text',
+        }),
+        makeFailure({
+          taskId: 'f3',
+          predicted: 'abc',
+          groundTruth: '7',
+          agentOutput: 'The answer is abc',
+        }),
+        // 2x could be hallucination (confident + wrong, no format mismatch)
+        makeFailure({
+          taskId: 'f4',
+          predicted: '37',
+          groundTruth: '42',
+          agentOutput: 'The answer is definitely 37. This is a well-known fact.',
+        }),
+        makeFailure({
+          taskId: 'f5',
+          predicted: '99',
+          groundTruth: '100',
+          agentOutput: 'The answer is clearly 99. No doubt about it.',
+        }),
+      ]
+
+      const patterns = analyzer.analyze(failures)
+      // wrong_format (3) should be top category, not hallucination (2)
+      expect(patterns[0].category).toBe('wrong_format')
+      expect(patterns[0].frequency).toBe(3)
+
+      const hallucinationP = patterns.find((p) => p.category === 'hallucination')
+      if (hallucinationP) {
+        expect(hallucinationP.frequency).toBeLessThan(patterns[0].frequency)
+      }
+    })
+  })
+
+  // ==========================================================================
+  // LLM mode
+  // ==========================================================================
+
+  describe('LLM mode', () => {
+    it('is available as a configuration option', () => {
+      const llmAnalyzer = new FailureAnalyzer({ mode: 'llm' })
+      // LLM mode currently falls back to heuristic
+      const failures = [makeFailure({ agentOutput: 'Cannot determine the answer' })]
+      const patterns = llmAnalyzer.analyze(failures)
+      expect(patterns).toHaveLength(1)
+    })
+  })
+})
diff --git a/packages/core/tests/evaluation/IterativeEvaluator.test.ts b/packages/core/tests/evaluation/IterativeEvaluator.test.ts
new file mode 100644
index 00000000..a4b6db89
--- /dev/null
+++ b/packages/core/tests/evaluation/IterativeEvaluator.test.ts
@@ -0,0 +1,237 @@
+import { describe, it, expect, vi } from 'vitest'
+import { IterativeEvaluator } from '../../src/evaluation/IterativeEvaluator.js'
+import type { AgentRunner, EvalTask } from '../../src/evaluation/IterativeEvaluator.js'
+
+function makeTasks(count: number, split: string): EvalTask[] {
+  return Array.from({ length: count }, (_, i) => ({
+    id: `${split}-${i}`,
+    question: `Question ${i}?`,
+    groundTruth: `answer${i}`,
+  }))
+}
+
+function createMockRunner(correctRate = 0.5): AgentRunner {
+  let callCount = 0
+  return {
+    run: vi.fn().mockImplementation(async ({ question }: { question: string }) => {
+      callCount++
+      const idx = parseInt(question.match(/\d+/)?.[0] ?? '0', 10)
+      // Return correct answer for the first `correctRate` fraction of tasks
+      const isCorrect = idx < Math.ceil(10 * correctRate)
+      return {
+        predicted: isCorrect ? `answer${idx}` : 'wrong',
+        agentOutput: isCorrect ? `The answer is answer${idx}` : 'I think the answer is wrong',
+        costTokens: 100,
+        toolCallFailed: false,
+        toolCallCount: 1,
+      }
+    }),
+  }
+}
+
+const trainTasks = makeTasks(10, 'train')
+const valTasks = makeTasks(10, 'val')
+const testTasks = makeTasks(5, 'test')
+
+const BASELINE_SKILL = '# Test Skill\n\n## Instructions\n\nDo something useful.\n'
+
+describe('IterativeEvaluator', () => {
+  describe('pre-loop baseline evaluation', () => {
+    it('seeds frontier with baseline accuracy (not 0)', async () => {
+      const runner = createMockRunner(0.6)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 1,
+        frontierSize: 3,
+        earlyStoppingPatience: 3,
+        costBudget: 100_000,
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      const result = await evaluator.run(
+        BASELINE_SKILL,
+        'test-skill',
+        trainTasks,
+        valTasks,
+        testTasks
+      )
+
+      // Convergence curve starts at iteration 0 with real accuracy
+      expect(result.convergenceCurve[0].iteration).toBe(0)
+      expect(result.convergenceCurve[0].bestAccuracy).toBeGreaterThan(0)
+    })
+  })
+
+  describe('early stopping', () => {
+    it('stops after patience iterations without improvement', async () => {
+      // Runner always returns same answers → accuracy never improves
+      const runner = createMockRunner(0.5)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 20,
+        frontierSize: 3,
+        earlyStoppingPatience: 3,
+        costBudget: 1_000_000,
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      const result = await evaluator.run(
+        BASELINE_SKILL,
+        'test-skill',
+        trainTasks,
+        valTasks,
+        testTasks
+      )
+
+      expect(result.totalIterations).toBeLessThanOrEqual(20)
+      expect(result.earlyStopReason).toContain('no improvement')
+    })
+  })
+
+  describe('cost budget enforcement', () => {
+    it('stops when budget is exhausted', async () => {
+      const runner = createMockRunner(0.5)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 100,
+        frontierSize: 3,
+        earlyStoppingPatience: 100,
+        costBudget: 500, // Very tight budget — each task costs 100 tokens
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      const result = await evaluator.run(
+        BASELINE_SKILL,
+        'test-skill',
+        trainTasks,
+        valTasks,
+        testTasks
+      )
+
+      expect(result.earlyStopReason).toContain('budget exhausted')
+      expect(result.totalCost).toBeGreaterThanOrEqual(500)
+    })
+  })
+
+  describe('convergence curve', () => {
+    it('records a snapshot per iteration', async () => {
+      const runner = createMockRunner(0.5)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 2,
+        frontierSize: 3,
+        earlyStoppingPatience: 10,
+        costBudget: 1_000_000,
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      const result = await evaluator.run(
+        BASELINE_SKILL,
+        'test-skill',
+        trainTasks,
+        valTasks,
+        testTasks
+      )
+
+      // iteration 0 (baseline) + up to 2 iterations
+      expect(result.convergenceCurve.length).toBeGreaterThanOrEqual(2)
+      // Each snapshot has required fields
+      for (const snap of result.convergenceCurve) {
+        expect(snap).toHaveProperty('iteration')
+        expect(snap).toHaveProperty('bestAccuracy')
+        expect(snap).toHaveProperty('cost')
+      }
+    })
+  })
+
+  describe('frontier updates', () => {
+    it('returns final frontier with scored variants', async () => {
+      const runner = createMockRunner(0.5)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 1,
+        frontierSize: 3,
+        earlyStoppingPatience: 10,
+        costBudget: 1_000_000,
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      const result = await evaluator.run(
+        BASELINE_SKILL,
+        'test-skill',
+        trainTasks,
+        valTasks,
+        testTasks
+      )
+
+      expect(result.finalFrontier.length).toBeGreaterThanOrEqual(1)
+      for (const scored of result.finalFrontier) {
+        expect(scored.variant).toBeDefined()
+        expect(scored.accuracy).toBeGreaterThanOrEqual(0)
+        expect(scored.accuracy).toBeLessThanOrEqual(1)
+      }
+    })
+  })
+
+  describe('test split isolation', () => {
+    it('evaluates test split only at the end', async () => {
+      const runner = createMockRunner(0.5)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 1,
+        frontierSize: 3,
+        earlyStoppingPatience: 10,
+        costBudget: 1_000_000,
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      const result = await evaluator.run(
+        BASELINE_SKILL,
+        'test-skill',
+        trainTasks,
+        valTasks,
+        testTasks
+      )
+
+      expect(result.testAccuracy).toBeDefined()
+      expect(result.testAccuracy).toBeGreaterThanOrEqual(0)
+      expect(result.testAccuracy).toBeLessThanOrEqual(1)
+    })
+  })
+
+  describe('log format', () => {
+    it('logs in expected format', async () => {
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {})
+      const runner = createMockRunner(0.5)
+      const evaluator = new IterativeEvaluator({
+        maxIterations: 1,
+        frontierSize: 3,
+        earlyStoppingPatience: 10,
+        costBudget: 1_000_000,
+        scorer: (_q, predicted, gt) => (predicted === gt ? 1.0 : 0.0),
+        agentRunner: runner,
+        generationStrategies: ['augment'],
+      })
+
+      await evaluator.run(BASELINE_SKILL, 'test-skill', trainTasks, valTasks, testTasks)
+
+      const logCalls = consoleSpy.mock.calls.flat()
+      const iterLog = logCalls.find(
+        (msg) => typeof msg === 'string' && msg.includes('[IterativeEvaluator]')
+      )
+      expect(iterLog).toBeDefined()
+      expect(iterLog).toContain('[iteration=')
+      expect(iterLog).toContain('[best_accuracy=')
+      expect(iterLog).toContain('[frontier_size=')
+      expect(iterLog).toContain('[cost=')
+
+      consoleSpy.mockRestore()
+    })
+  })
+})
diff --git a/packages/core/tests/evaluation/SkillVariantGenerator.test.ts b/packages/core/tests/evaluation/SkillVariantGenerator.test.ts
new file mode 100644
index 00000000..0690d8d7
--- /dev/null
+++ b/packages/core/tests/evaluation/SkillVariantGenerator.test.ts
@@ -0,0 +1,265 @@
+import { describe, it, expect, vi } from 'vitest'
+import { SkillVariantGenerator } from '../../src/evaluation/SkillVariantGenerator.js'
+import type { RewriteClient } from '../../src/evaluation/SkillVariantGenerator.js'
+import type { FailurePattern } from '../../src/evaluation/types.js'
+
+const BASIC_SKILL = `# Test Skill
+
+## Instructions
+
+Do something useful.
+
+## Examples
+
+Example 1: hello world
+`
+
+function makeFailurePatterns(count = 3): FailurePattern[] {
+  const categories = ['wrong_format', 'missing_context', 'reasoning_error'] as const
+  return categories.slice(0, count).map((cat, i) => ({
+    category: cat,
+    frequency: 10 - i * 3,
+    examples: [],
+    suggestedFix: `Fix for ${cat}`,
+  }))
+}
+
+function makeLargeSkill(lineCount: number): string {
+  const lines = ['# Large Skill', '']
+  for (let i = 0; i < 5; i++) {
+    lines.push(`## Section ${i + 1}`, '')
+    const sectionSize = Math.floor((lineCount - 12) / 5)
+    for (let j = 0; j < sectionSize; j++) {
+      lines.push(`Line ${j} of section ${i + 1}`)
+    }
+    lines.push('')
+  }
+  return lines.join('\n')
+}
+
+describe('SkillVariantGenerator', () => {
+  const baseParams = {
+    skillId: 'test-skill',
+    content: BASIC_SKILL,
+    parentId: null,
+    iteration: 1,
+    failurePatterns: makeFailurePatterns(),
+  }
+
+  describe('augment strategy', () => {
+    it('appends failure fixes under ## Skill Improvement Notes', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['augment'] })
+      const variants = await gen.generate(baseParams)
+      expect(variants).toHaveLength(1)
+      expect(variants[0].generationMethod).toBe('augment')
+      expect(variants[0].content).toContain('## Skill Improvement Notes')
+      expect(variants[0].content).toContain('wrong_format')
+      expect(variants[0].content).toContain('missing_context')
+      expect(variants[0].content).toContain('reasoning_error')
+    })
+
+    it('replaces existing ## Skill Improvement Notes section', async () => {
+      const existingContent = `${BASIC_SKILL}\n## Skill Improvement Notes\n\n- old fix\n`
+      const gen = new SkillVariantGenerator({ strategies: ['augment'] })
+      const variants = await gen.generate({
+        ...baseParams,
+        content: existingContent,
+      })
+      expect(variants).toHaveLength(1)
+      // Should not have double sections
+      const matches = variants[0].content.match(/## Skill Improvement Notes/g)
+      expect(matches).toHaveLength(1)
+      // Should have new fixes, not old
+      expect(variants[0].content).not.toContain('old fix')
+      expect(variants[0].content).toContain('wrong_format')
+    })
+
+    it('returns nothing when no failure patterns', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['augment'] })
+      const variants = await gen.generate({
+        ...baseParams,
+        failurePatterns: [],
+      })
+      expect(variants).toHaveLength(0)
+    })
+  })
+
+  describe('decompose strategy', () => {
+    it('is skipped for skills <=200 lines', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['decompose'] })
+      const variants = await gen.generate(baseParams) // BASIC_SKILL is ~10 lines
+      expect(variants).toHaveLength(0)
+    })
+
+    it('produces a focused variant for skills >200 lines', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['decompose'] })
+      const largeContent = makeLargeSkill(250)
+      const variants = await gen.generate({
+        ...baseParams,
+        content: largeContent,
+      })
+      expect(variants.length).toBeGreaterThanOrEqual(1)
+      const variant = variants[0]
+      expect(variant.generationMethod).toBe('decompose')
+      expect(variant.content.split('\n').length).toBeLessThan(largeContent.split('\n').length)
+    })
+  })
+
+  describe('specialize strategy', () => {
+    it('returns null for general domain', async () => {
+      const gen = new SkillVariantGenerator({
+        strategies: ['specialize'],
+        benchmarkDomain: 'general',
+      })
+      const variants = await gen.generate(baseParams)
+      expect(variants).toHaveLength(0)
+    })
+
+    it('strips irrelevant sections for a specific domain', async () => {
+      const content = [
+        '# Multi-Domain Skill',
+        '',
+        '## Finance Section',
+        '',
+        'This section covers finance and accounting.',
+        '',
+        '## Cooking Section',
+        '',
+        'This section covers cooking recipes.',
+        '',
+        '## Finance Analysis',
+        '',
+        'More finance content here.',
+      ].join('\n')
+
+      const gen = new SkillVariantGenerator({
+        strategies: ['specialize'],
+        benchmarkDomain: 'finance',
+      })
+      const variants = await gen.generate({
+        ...baseParams,
+        content,
+      })
+      expect(variants).toHaveLength(1)
+      expect(variants[0].content).toContain('Finance Section')
+      expect(variants[0].content).toContain('Finance Analysis')
+      expect(variants[0].content).not.toContain('Cooking Section')
+    })
+  })
+
+  describe('llm_rewrite strategy', () => {
+    it('returns null when no rewriteClient is provided', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['llm_rewrite'] })
+      const variants = await gen.generate(baseParams)
+      expect(variants).toHaveLength(0)
+    })
+
+    it('produces a variant using the rewrite client', async () => {
+      const mockClient: RewriteClient = {
+        rewrite: vi.fn().mockResolvedValue('# Rewritten Skill\n\nImproved content here.'),
+      }
+      const gen = new SkillVariantGenerator({
+        strategies: ['llm_rewrite'],
+        rewriteClient: mockClient,
+        benchmarkDomain: 'qa',
+      })
+      const variants = await gen.generate(baseParams)
+      expect(variants).toHaveLength(1)
+      expect(variants[0].generationMethod).toBe('llm_rewrite')
+      expect(variants[0].content).toContain('Rewritten Skill')
+      expect(mockClient.rewrite).toHaveBeenCalledWith({
+        model: 'claude-sonnet-4-6',
+        skillContent: BASIC_SKILL,
+        failurePatterns: baseParams.failurePatterns,
+        benchmarkDomain: 'qa',
+      })
+    })
+
+    it('skips variant when rewrite returns identical content', async () => {
+      const mockClient: RewriteClient = {
+        rewrite: vi.fn().mockResolvedValue(BASIC_SKILL),
+      }
+      const gen = new SkillVariantGenerator({
+        strategies: ['llm_rewrite'],
+        rewriteClient: mockClient,
+      })
+      const variants = await gen.generate(baseParams)
+      expect(variants).toHaveLength(0)
+    })
+  })
+
+  describe('deduplication', () => {
+    it('deduplicates identical content from different frontier members', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['augment'] })
+
+      const variants1 = await gen.generate(baseParams)
+      expect(variants1).toHaveLength(1)
+
+      // Same content + same patterns → same output → deduplicated
+      const variants2 = await gen.generate({
+        ...baseParams,
+        parentId: 'different-parent',
+      })
+      expect(variants2).toHaveLength(0)
+    })
+
+    it('resets dedup between runs', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['augment'] })
+
+      const variants1 = await gen.generate(baseParams)
+      expect(variants1).toHaveLength(1)
+
+      gen.resetDedup()
+
+      const variants2 = await gen.generate(baseParams)
+      expect(variants2).toHaveLength(1)
+    })
+  })
+
+  describe('variant metadata', () => {
+    it('sets contentLines and costTokens', async () => {
+      const gen = new SkillVariantGenerator({ strategies: ['augment'] })
+      const variants = await gen.generate(baseParams)
+      expect(variants[0].contentLines).toBeGreaterThan(0)
+      expect(variants[0].costTokens).toBe(0) // augment is free
+    })
+
+    it('sets non-zero costTokens for llm_rewrite', async () => {
+      const mockClient: RewriteClient = {
+        rewrite: vi.fn().mockResolvedValue('# Rewritten\n\nNew content.'),
+      }
+      const gen = new SkillVariantGenerator({
+        strategies: ['llm_rewrite'],
+        rewriteClient: mockClient,
+      })
+      const variants = await gen.generate(baseParams)
+      expect(variants[0].costTokens).toBeGreaterThan(0)
+    })
+
+    it('generates unique IDs per variant', async () => {
+      const gen = new SkillVariantGenerator({
+        strategies: ['augment', 'specialize'],
+        benchmarkDomain: 'finance',
+      })
+      const content = [
+        '# Skill',
+        '',
+        '## Finance',
+        '',
+        'Finance content.',
+        '',
+        '## Cooking',
+        '',
+        'Cooking content.',
+      ].join('\n')
+      const variants = await gen.generate({
+        ...baseParams,
+        content,
+      })
+      if (variants.length >= 2) {
+        expect(variants[0].id).not.toBe(variants[1].id)
+        expect(variants[0].contentHash).not.toBe(variants[1].contentHash)
+      }
+    })
+  })
+})
diff --git a/packages/core/tests/evaluation/VariantSelector.test.ts b/packages/core/tests/evaluation/VariantSelector.test.ts
new file mode 100644
index 00000000..9317f676
--- /dev/null
+++ b/packages/core/tests/evaluation/VariantSelector.test.ts
@@ -0,0 +1,143 @@
+import { describe, it, expect } from 'vitest'
+import { VariantSelector } from '../../src/evaluation/VariantSelector.js'
+import type { ScoredVariant, SkillVariant } from '../../src/evaluation/types.js'
+
+function makeScoredVariant(
+  overrides: Partial<ScoredVariant> & { accuracy: number; cost: number }
+): ScoredVariant {
+  const variant: SkillVariant = {
+    id: `v-${Math.random().toString(36).slice(2, 8)}`,
+    contentHash: `hash-${Math.random().toString(36).slice(2, 8)}`,
+    content: '# Test Skill',
+    parentId: null,
+    skillId: 'test-skill',
+    iteration: 1,
+    generationMethod: 'augment',
+    ...overrides.variant,
+  }
+  return {
+    variant,
+    accuracy: overrides.accuracy,
+    cost: overrides.cost,
+    skillSize: overrides.skillSize ?? 50,
+  }
+}
+
+describe('VariantSelector', () => {
+  const selector = new VariantSelector()
+
+  describe('Pareto dominance', () => {
+    it('keeps non-dominated variants', () => {
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.9, cost: 100 }), // A: high acc, high cost
+        makeScoredVariant({ accuracy: 0.7, cost: 50 }), // B: med acc, low cost
+        makeScoredVariant({ accuracy: 0.6, cost: 200 }), // C: dominated by both A and B
+      ]
+
+      const result = selector.select(candidates, 10)
+      expect(result).toHaveLength(2)
+
+      const methods = result.map((r) => r.accuracy)
+      expect(methods).toContain(0.9)
+      expect(methods).toContain(0.7)
+    })
+
+    it('removes strictly dominated variants', () => {
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.8, cost: 100 }),
+        makeScoredVariant({ accuracy: 0.7, cost: 150 }), // dominated: worse acc AND worse cost
+      ]
+
+      const result = selector.select(candidates, 10)
+      expect(result).toHaveLength(1)
+      expect(result[0].accuracy).toBe(0.8)
+    })
+
+    it('keeps both when neither dominates', () => {
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.9, cost: 200 }),
+        makeScoredVariant({ accuracy: 0.7, cost: 50 }),
+      ]
+
+      const result = selector.select(candidates, 10)
+      expect(result).toHaveLength(2)
+    })
+
+    it('keeps equal variants (neither dominates the other)', () => {
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.8, cost: 100 }),
+        makeScoredVariant({ accuracy: 0.8, cost: 100 }),
+      ]
+
+      const result = selector.select(candidates, 10)
+      expect(result).toHaveLength(2)
+    })
+  })
+
+  describe('frontier size enforcement', () => {
+    it('limits result to frontierSize', () => {
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.9, cost: 300 }),
+        makeScoredVariant({ accuracy: 0.8, cost: 200 }),
+        makeScoredVariant({ accuracy: 0.7, cost: 100 }),
+        makeScoredVariant({ accuracy: 0.6, cost: 50 }),
+      ]
+
+      const result = selector.select(candidates, 2)
+      expect(result.length).toBeLessThanOrEqual(2)
+    })
+
+    it('returns all when candidates <= frontierSize', () => {
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.9, cost: 100 }),
+        makeScoredVariant({ accuracy: 0.7, cost: 50 }),
+      ]
+
+      const result = selector.select(candidates, 5)
+      expect(result).toHaveLength(2)
+    })
+  })
+
+  describe('tiebreaker on skillSize', () => {
+    it('prefers smaller skillSize when accuracy is equal', () => {
+      // Each trades accuracy for cost → all non-dominated
+      // B and C tie on accuracy and cost → neither dominates the other
+      const candidates = [
+        makeScoredVariant({ accuracy: 0.9, cost: 300, skillSize: 200 }), // A
+        makeScoredVariant({ accuracy: 0.8, cost: 100, skillSize: 50 }), // B
+        makeScoredVariant({ accuracy: 0.8, cost: 100, skillSize: 150 }), // C
+        makeScoredVariant({ accuracy: 0.7, cost: 50, skillSize: 100 }), // D
+      ]
+
+      // All 4 non-dominated. Limit to 2 → sort by accuracy desc, tiebreak skillSize asc.
+      // A (0.9) first. B vs C (both 0.8): B has skillSize 50 < C's 150 → B wins.
+      const result = selector.select(candidates, 2)
+      expect(result).toHaveLength(2)
+      expect(result[0].accuracy).toBe(0.9)
+      expect(result[1].accuracy).toBe(0.8)
+      expect(result[1].skillSize).toBe(50)
+    })
+  })
+
+  describe('edge cases', () => {
+    it('returns empty for empty input', () => {
+      const result = selector.select([], 5)
+      expect(result).toHaveLength(0)
+    })
+
+    it('handles single candidate', () => {
+      const candidates = [makeScoredVariant({ accuracy: 0.5, cost: 100 })]
+      const result = selector.select(candidates, 3)
+      expect(result).toHaveLength(1)
+    })
+
+    it('handles all identical candidates', () => {
+      const candidates = Array.from({ length: 5 }, () =>
+        makeScoredVariant({ accuracy: 0.8, cost: 100, skillSize: 50 })
+      )
+      const result = selector.select(candidates, 3)
+      // None dominate each other since all are equal
+      expect(result.length).toBeLessThanOrEqual(3)
+    })
+  })
+})
diff --git a/packages/core/tests/repositories/BenchmarkRepository.test.ts b/packages/core/tests/repositories/BenchmarkRepository.test.ts
new file mode 100644
index 00000000..044d6f56
--- /dev/null
+++ b/packages/core/tests/repositories/BenchmarkRepository.test.ts
@@ -0,0 +1,453 @@
+/**
+ * @fileoverview Tests for BenchmarkRepository (SMI-3292)
+ * @module @skillsmith/core/tests/repositories/BenchmarkRepository
+ *
+ * Tests CRUD operations for benchmark_results, skill_variants,
+ * and failure_patterns tables. Uses createTestDatabase() which
+ * runs all migrations including v11.
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest'
+import { createTestDatabase, closeDatabase } from '../helpers/database.js'
+import type { Database } from '../../src/db/database-interface.js'
+import { BenchmarkRepository } from '../../src/repositories/BenchmarkRepository.js'
+
+let db: Database
+let repo: BenchmarkRepository
+
+beforeEach(() => {
+  db = createTestDatabase()
+  repo = new BenchmarkRepository(db)
+
+  // Insert a fixture skill for FK references
+  db.exec(`
+    INSERT INTO skills (id, name, author, description)
+    VALUES ('skill-1', 'test-skill', 'test-author', 'A test skill');
+  `)
+})
+
+afterEach(() => {
+  closeDatabase(db)
+})
+
+// ============================================================================
+// benchmark_results
+// ============================================================================
+
+describe('BenchmarkRepository — benchmark_results', () => {
+  const baseResult = {
+    id: 'br-1',
+    skillId: 'skill-1',
+    skillVariantHash: 'abc123',
+    benchmark: 'officeqa' as const,
+    split: 'val' as const,
+    condition: 'skillsmith-search',
+    accuracy: 0.75,
+    taskCount: 100,
+    correctCount: 75,
+    scorer: 'exact_match' as const,
+    modelId: 'claude-sonnet-4-6',
+    seed: 42,
+  }
+
+  it('inserts and retrieves a result', () => {
+    repo.insertResult(baseResult)
+    const row = repo.getResult('br-1')
+
+    expect(row).toBeDefined()
+    expect(row!.skill_id).toBe('skill-1')
+    expect(row!.accuracy).toBe(0.75)
+    expect(row!.task_count).toBe(100)
+    expect(row!.correct_count).toBe(75)
+    expect(row!.iteration).toBe(0)
+  })
+
+  it('enforces correct_count <= task_count at DB layer', () => {
+    expect(() =>
+      repo.insertResult({
+        ...baseResult,
+        id: 'br-bad',
+        correctCount: 101, // exceeds taskCount of 100
+      })
+    ).toThrow()
+  })
+
+  it('enforces accuracy range 0-1', () => {
+    expect(() =>
+      repo.insertResult({
+        ...baseResult,
+        id: 'br-bad',
+        accuracy: 1.5,
+      })
+    ).toThrow()
+  })
+
+  it('enforces valid benchmark values', () => {
+    expect(() =>
+      repo.insertResult({
+        ...baseResult,
+        id: 'br-bad',
+        benchmark: 'invalid' as 'officeqa',
+      })
+    ).toThrow()
+  })
+
+  it('queries results by skill', () => {
+    repo.insertResult(baseResult)
+    repo.insertResult({
+      ...baseResult,
+      id: 'br-2',
+      benchmark: 'sealqa',
+      split: 'test',
+    })
+
+    const all = repo.getResultsBySkill('skill-1')
+    expect(all).toHaveLength(2)
+
+    const filtered = repo.getResultsBySkill('skill-1', 'officeqa', 'val')
+    expect(filtered).toHaveLength(1)
+    expect(filtered[0].id).toBe('br-1')
+  })
+
+  it('queries results by condition', () => {
+    repo.insertResult(baseResult)
+    repo.insertResult({
+      ...baseResult,
+      id: 'br-2',
+      condition: 'skillsmith-search',
+      iteration: 1,
+      seed: 43,
+    })
+
+    const results = repo.getResultsByCondition('skillsmith-search', 'officeqa')
+    expect(results).toHaveLength(2)
+    // Ordered by iteration ASC
+    expect(results[0].iteration).toBe(0)
+    expect(results[1].iteration).toBe(1)
+  })
+
+  it('deletes a result', () => {
+    repo.insertResult(baseResult)
+    expect(repo.deleteResult('br-1')).toBe(true)
+    expect(repo.getResult('br-1')).toBeUndefined()
+    expect(repo.deleteResult('nonexistent')).toBe(false)
+  })
+
+  it('stores optional cost fields', () => {
+    repo.insertResult({
+      ...baseResult,
+      costTokens: 50000,
+      costDollars: 0.25,
+      wallClockMs: 12000,
+    })
+    const row = repo.getResult('br-1')!
+    expect(row.cost_tokens).toBe(50000)
+    expect(row.cost_dollars).toBe(0.25)
+    expect(row.wall_clock_ms).toBe(12000)
+  })
+})
+
+// ============================================================================
+// skill_variants
+// ============================================================================
+
+describe('BenchmarkRepository — skill_variants', () => {
+  const baseVariant = {
+    id: 'sv-1',
+    skillId: 'skill-1',
+    contentHash: 'hash-abc',
+    iteration: 0,
+    generationMethod: 'baseline' as const,
+  }
+
+  it('inserts and retrieves a variant', () => {
+    repo.insertVariant(baseVariant)
+    const row = repo.getVariant('sv-1')
+
+    expect(row).toBeDefined()
+    expect(row!.skill_id).toBe('skill-1')
+    expect(row!.content_hash).toBe('hash-abc')
+    expect(row!.is_frontier).toBe(0)
+    expect(row!.parent_variant_id).toBeNull()
+  })
+
+  it('enforces UNIQUE(skill_id, content_hash)', () => {
+    repo.insertVariant(baseVariant)
+    expect(() =>
+      repo.insertVariant({
+        ...baseVariant,
+        id: 'sv-2', // different UUID
+        // same skill_id + content_hash → should fail
+      })
+    ).toThrow()
+  })
+
+  it('enforces is_frontier IN (0, 1)', () => {
+    expect(() =>
+      repo.insertVariant({
+        ...baseVariant,
+        id: 'sv-bad',
+      })
+    ).not.toThrow()
+
+    // Direct SQL to test constraint bypass
+    expect(() =>
+      db.exec(`
+        INSERT INTO skill_variants
+          (id, skill_id, content_hash, iteration, generation_method, is_frontier)
+        VALUES ('sv-bad2', 'skill-1', 'hash-bad2', 0, 'baseline', 2)
+      `)
+    ).toThrow()
+  })
+
+  it('enforces valid generation_method values', () => {
+    expect(() =>
+      repo.insertVariant({
+        ...baseVariant,
+        id: 'sv-bad',
+        contentHash: 'hash-bad',
+        generationMethod: 'invalid' as 'baseline',
+      })
+    ).toThrow()
+  })
+
+  it('looks up by content hash', () => {
+    repo.insertVariant(baseVariant)
+    const row = repo.getVariantByHash('skill-1', 'hash-abc')
+    expect(row).toBeDefined()
+    expect(row!.id).toBe('sv-1')
+
+    expect(repo.getVariantByHash('skill-1', 'nonexistent')).toBeUndefined()
+  })
+
+  it('manages frontier membership', () => {
+    repo.insertVariant({ ...baseVariant, isFrontier: true })
+    repo.insertVariant({
+      ...baseVariant,
+      id: 'sv-2',
+      contentHash: 'hash-def',
+      iteration: 1,
+      generationMethod: 'augment',
+      isFrontier: true,
+    })
+
+    const frontier = repo.getFrontierVariants('skill-1')
+    expect(frontier).toHaveLength(2)
+
+    repo.clearFrontier('skill-1')
+    expect(repo.getFrontierVariants('skill-1')).toHaveLength(0)
+  })
+
+  it('updates accuracy values', () => {
+    repo.insertVariant(baseVariant)
+    repo.updateVariantAccuracy('sv-1', 0.6, 0.65, null)
+
+    const row = repo.getVariant('sv-1')!
+    expect(row.accuracy_train).toBe(0.6)
+    expect(row.accuracy_val).toBe(0.65)
+    expect(row.accuracy_test).toBeNull()
+  })
+
+  it('sets frontier on individual variant', () => {
+    repo.insertVariant(baseVariant)
+    expect(repo.getVariant('sv-1')!.is_frontier).toBe(0)
+
+    repo.setFrontier('sv-1', true)
+    expect(repo.getVariant('sv-1')!.is_frontier).toBe(1)
+
+    repo.setFrontier('sv-1', false)
+    expect(repo.getVariant('sv-1')!.is_frontier).toBe(0)
+  })
+
+  it('tracks parent lineage', () => {
+    repo.insertVariant(baseVariant)
+    repo.insertVariant({
+      id: 'sv-child',
+      skillId: 'skill-1',
+      parentVariantId: 'sv-1',
+      contentHash: 'hash-child',
+      iteration: 1,
+      generationMethod: 'augment',
+    })
+
+    const child = repo.getVariant('sv-child')!
+    expect(child.parent_variant_id).toBe('sv-1')
+  })
+
+  it('deletes a variant', () => {
+    repo.insertVariant(baseVariant)
+    expect(repo.deleteVariant('sv-1')).toBe(true)
+    expect(repo.getVariant('sv-1')).toBeUndefined()
+  })
+})
+
+// ============================================================================
+// failure_patterns
+// ============================================================================
+
+describe('BenchmarkRepository — failure_patterns', () => {
+  const resultId = 'br-fp'
+
+  beforeEach(() => {
+    // Insert a benchmark result first for FK
+    repo.insertResult({
+      id: resultId,
+      skillId: 'skill-1',
+      skillVariantHash: 'hash-fp',
+      benchmark: 'officeqa',
+      split: 'val',
+      condition: 'test-cond',
+      accuracy: 0.5,
+      taskCount: 10,
+      correctCount: 5,
+      scorer: 'exact_match',
+      modelId: 'claude-sonnet-4-6',
+      seed: 1,
+    })
+  })
+
+  it('inserts and retrieves a pattern', () => {
+    repo.insertPattern({
+      id: 'fp-1',
+      benchmarkResultId: resultId,
+      category: 'wrong_format',
+      frequency: 3,
+      exampleTasks: ['task-1', 'task-2'],
+      suggestedFix: 'Add format instructions',
+    })
+
+    const row = repo.getPattern('fp-1')
+    expect(row).toBeDefined()
+    expect(row!.category).toBe('wrong_format')
+    expect(row!.frequency).toBe(3)
+    expect(JSON.parse(row!.example_tasks!)).toEqual(['task-1', 'task-2'])
+    expect(row!.suggested_fix).toBe('Add format instructions')
+  })
+
+  it('enforces valid category values', () => {
+    expect(() =>
+      repo.insertPattern({
+        id: 'fp-bad',
+        benchmarkResultId: resultId,
+        category: 'invalid' as 'wrong_format',
+        frequency: 1,
+      })
+    ).toThrow()
+  })
+
+  it('queries patterns by result, ordered by frequency DESC', () => {
+    repo.insertPattern({
+      id: 'fp-1',
+      benchmarkResultId: resultId,
+      category: 'wrong_format',
+      frequency: 3,
+    })
+    repo.insertPattern({
+      id: 'fp-2',
+      benchmarkResultId: resultId,
+      category: 'reasoning_error',
+      frequency: 5,
+    })
+    repo.insertPattern({
+      id: 'fp-3',
+      benchmarkResultId: resultId,
+      category: 'tool_misuse',
+      frequency: 1,
+    })
+
+    const patterns = repo.getPatternsByResult(resultId)
+    expect(patterns).toHaveLength(3)
+    expect(patterns[0].frequency).toBe(5) // reasoning_error first
+    expect(patterns[1].frequency).toBe(3)
+    expect(patterns[2].frequency).toBe(1)
+  })
+
+  it('handles null example_tasks and suggested_fix', () => {
+    repo.insertPattern({
+      id: 'fp-null',
+      benchmarkResultId: resultId,
+      category: 'hallucination',
+      frequency: 2,
+    })
+
+    const row = repo.getPattern('fp-null')!
+    expect(row.example_tasks).toBeNull()
+    expect(row.suggested_fix).toBeNull()
+  })
+
+  it('deletes patterns by result', () => {
+    repo.insertPattern({
+      id: 'fp-1',
+      benchmarkResultId: resultId,
+      category: 'wrong_format',
+      frequency: 1,
+    })
+    repo.insertPattern({
+      id: 'fp-2',
+      benchmarkResultId: resultId,
+      category: 'reasoning_error',
+      frequency: 2,
+    })
+
+    const deleted = repo.deletePatternsByResult(resultId)
+    expect(deleted).toBe(2)
+    expect(repo.getPatternsByResult(resultId)).toHaveLength(0)
+  })
+
+  it('deletes a single pattern', () => {
+    repo.insertPattern({
+      id: 'fp-1',
+      benchmarkResultId: resultId,
+      category: 'wrong_format',
+      frequency: 1,
+    })
+    expect(repo.deletePattern('fp-1')).toBe(true)
+    expect(repo.getPattern('fp-1')).toBeUndefined()
+  })
+})
+
+// ============================================================================
+// Schema integrity
+// ============================================================================
+
+describe('BenchmarkRepository — schema integrity', () => {
+  it('all 3 tables exist after createTestDatabase()', () => {
+    const tables = db
+      .prepare(
+        `SELECT name FROM sqlite_master WHERE type='table'
+         AND name IN ('benchmark_results', 'skill_variants', 'failure_patterns')
+         ORDER BY name`
+      )
+      .all() as { name: string }[]
+
+    expect(tables.map((t) => t.name)).toEqual([
+      'benchmark_results',
+      'failure_patterns',
+      'skill_variants',
+    ])
+  })
+
+  it('indexes exist for benchmark_results', () => {
+    const indexes = db
+      .prepare(
+        `SELECT name FROM sqlite_master WHERE type='index'
+         AND name LIKE 'idx_benchmark_results%'`
+      )
+      .all() as { name: string }[]
+
+    const names = indexes.map((i) => i.name)
+    expect(names).toContain('idx_benchmark_results_skill')
+    expect(names).toContain('idx_benchmark_results_condition')
+  })
+
+  it('partial index exists for frontier variants', () => {
+    const indexes = db
+      .prepare(
+        `SELECT name FROM sqlite_master WHERE type='index'
+         AND name = 'idx_skill_variants_frontier'`
+      )
+      .all() as { name: string }[]
+
+    expect(indexes).toHaveLength(1)
+  })
+})
diff --git a/packages/core/tests/unit/migrations/v10-dependencies.test.ts b/packages/core/tests/unit/migrations/v10-dependencies.test.ts
index 30ac2b7c..47ff6844 100644
--- a/packages/core/tests/unit/migrations/v10-dependencies.test.ts
+++ b/packages/core/tests/unit/migrations/v10-dependencies.test.ts
@@ -79,9 +79,9 @@ describe('Migration v10: skill_dependencies table', () => {
     expect(() => db.exec(MIGRATION_V10_SQL)).not.toThrow()
   })
 
-  it('bumps schema version to 10', () => {
-    expect(getSchemaVersion(db)).toBe(10)
-    expect(SCHEMA_VERSION).toBe(10)
+  it('bumps schema version to latest', () => {
+    expect(getSchemaVersion(db)).toBe(SCHEMA_VERSION)
+    expect(SCHEMA_VERSION).toBeGreaterThanOrEqual(10)
   })
 
   it('unique index prevents duplicate (skill_id, dep_type, dep_target, dep_source)', () => {