Skip to content

Commit 6753e32

Browse files
author
StackMemory Bot (CLI)
committed
feat(conductor): multi-model routing, difficulty prediction, slim tarball
- Add complexity-based model selection: simple→Sonnet, complex→Opus, with `--model auto|<specific>` on `conductor start`
- Add `predictDifficulty()` scoring from labels, description, priority, and historical outcomes
- Add `conductor predict` command for issue difficulty estimation
- Add `--predict` flag on `conductor learn` for predicted-vs-actual comparison
- Exclude `scripts/gepa/` from the npm tarball (~100KB savings)
- Store labels in outcome entries for historical label-based analysis
1 parent ef1c109 commit 6753e32

File tree

3 files changed

+461
-19
lines changed

3 files changed

+461
-19
lines changed

package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@stackmemoryai/stackmemory",
3-
"version": "1.6.0",
3+
"version": "1.6.1",
44
"description": "Lossless, project-scoped memory for AI coding tools. Durable context across sessions with 56 MCP tools, FTS5 search, conductor orchestrator, loop/watch monitoring, snapshot capture, pre-flight overlap checks, Claude/Codex/OpenCode wrappers, Linear sync, and automatic hooks.",
55
"engines": {
66
"node": ">=20.0.0",
@@ -20,7 +20,6 @@
2020
"files": [
2121
"bin",
2222
"dist/src",
23-
"scripts/gepa",
2423
"scripts/git-hooks",
2524
"scripts/hooks",
2625
"scripts/setup",

src/cli/commands/orchestrate.ts

Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,148 @@ function ensureDefaultPromptTemplate(): string {
332332
return templatePath;
333333
}
334334

335+
// ── Difficulty Prediction ──
336+
337+
/** Coarse difficulty bucket assigned to an issue. */
export type DifficultyLevel = 'easy' | 'medium' | 'hard';

/** Result of {@link predictDifficulty}. */
export interface DifficultyPrediction {
  /** Predicted difficulty bucket. */
  difficulty: DifficultyLevel;
  /** Starts at 0.5, grows by 0.1 per signal that fired, capped at 0.9. */
  confidence: number;
  /** Human-readable explanation for each signal that fired (never empty). */
  reasons: string[];
}

/**
 * Predict issue difficulty from labels, description, priority, and historical outcomes.
 * Used for model selection and retry budget allocation.
 *
 * Signals are evaluated in a fixed order and a later signal may override the
 * difficulty chosen by an earlier one:
 *  1. Historical failure rate of outcomes sharing a label (>60% → hard, <20% → easy).
 *  2. Description shorter than 100 chars plus a bug/fix label → easy.
 *  3. Description longer than 500 chars and/or a feature/refactor/architecture
 *     label → bumps towards hard.
 *  4. Priority 1 or 2 → bumps difficulty one level.
 *  5. Historical average tool calls above 80 for outcomes sharing a label → hard.
 *
 * Historical signals (1 and 5) only apply when at least 3 outcomes share a
 * label (case-insensitive) with the issue.
 *
 * @param labels - Issue labels; compared case-insensitively.
 * @param description - Issue description text; only its length is inspected.
 * @param priority - Issue priority; 1 is reported as "urgent" and 2 as "high".
 * @param outcomes - Previously recorded agent outcomes used as history.
 * @returns The predicted difficulty, a confidence value, and the reasons.
 */
export function predictDifficulty(
  labels: string[],
  description: string,
  priority: number,
  outcomes: AgentOutcomeEntry[]
): DifficultyPrediction {
  let difficulty: DifficultyLevel = 'medium';
  // Count fired signals instead of accumulating `confidence += 0.1`, which
  // drifts in floating point (0.5 + 0.1 + 0.1 + 0.1 === 0.7999999999999999).
  let signalCount = 0;
  const reasons: string[] = [];
  const lowerLabels = labels.map((l) => l.toLowerCase());
  const labelSet = new Set(lowerLabels);

  // Outcomes that share at least one label with this issue. Computed once and
  // reused by both historical signals (1 and 5).
  const matching =
    labelSet.size > 0
      ? outcomes.filter((o) =>
          o.labels?.some((ol) => labelSet.has(ol.toLowerCase()))
        )
      : [];
  const hasEnoughHistory = matching.length >= 3;

  // Signal 1: Historical failure rate for matching labels
  if (hasEnoughHistory) {
    const failRate =
      matching.filter((o) => o.outcome === 'failure').length / matching.length;
    if (failRate > 0.6 || failRate < 0.2) {
      difficulty = failRate > 0.6 ? 'hard' : 'easy';
      signalCount++;
      reasons.push(
        `Historical failure rate ${Math.round(failRate * 100)}% for similar labels`
      );
    }
  }

  // Signal 2: Short description + bug/fix label → likely easy
  const hasBugOrFix = lowerLabels.some(
    (l) => l === 'bug' || l === 'fix' || l.includes('bugfix')
  );
  if (description.length < 100 && hasBugOrFix) {
    difficulty = 'easy';
    signalCount++;
    reasons.push('Short description with bug/fix label suggests simple fix');
  }

  // Signal 3: Long description or feature/refactor label → likely hard
  const hasComplexLabel = lowerLabels.some(
    (l) =>
      l === 'feature' ||
      l === 'refactor' ||
      l === 'refactoring' ||
      l === 'architecture'
  );
  const isLongDescription = description.length > 500;
  if (isLongDescription || hasComplexLabel) {
    if (isLongDescription && hasComplexLabel) {
      difficulty = 'hard';
    } else if (difficulty === 'easy') {
      // A single complexity hint only lifts an "easy" verdict to "medium".
      difficulty = 'medium';
    } else {
      difficulty = 'hard';
    }
    signalCount++;
    if (isLongDescription) {
      reasons.push(
        `Long description (${description.length} chars) suggests complexity`
      );
    }
    if (hasComplexLabel) {
      reasons.push('Feature/refactor label suggests higher complexity');
    }
  }

  // Signal 4: High priority → bump difficulty
  if (priority === 1 || priority === 2) {
    if (difficulty === 'easy') difficulty = 'medium';
    else if (difficulty === 'medium') difficulty = 'hard';
    signalCount++;
    reasons.push(
      `Priority ${priority} (${priority === 1 ? 'urgent' : 'high'}) — higher difficulty expected`
    );
  }

  // Signal 5: Historical avg toolCalls for similar labels > 80 → hard
  if (hasEnoughHistory) {
    const avgToolCalls =
      matching.reduce((sum, o) => sum + o.toolCalls, 0) / matching.length;
    if (avgToolCalls > 80) {
      difficulty = 'hard';
      signalCount++;
      reasons.push(
        `Historical avg tool calls ${Math.round(avgToolCalls)} for similar labels (>80)`
      );
    }
  }

  if (reasons.length === 0) {
    reasons.push('No strong signals — defaulting to medium');
  }

  // Base 0.5, +0.1 per fired signal, capped at 0.9. Computed in tenths so the
  // result is the exact nearest double (0.8, not 0.7999999999999999).
  const confidence = Math.min((5 + signalCount) / 10, 0.9);

  return { difficulty, confidence, reasons };
}
461+
462+
/**
 * Load outcomes from disk.
 *
 * Reads the file at `getOutcomesLogPath()` and parses each non-blank line as
 * one JSON-encoded outcome entry.
 *
 * Lines are parsed independently: a malformed line (e.g. a truncated final
 * write) is skipped instead of discarding every other entry in the log.
 *
 * @returns The parsed entries in file order, or an empty array when the log
 *          does not exist or cannot be read.
 */
function loadOutcomes(): AgentOutcomeEntry[] {
  const logPath = getOutcomesLogPath();
  if (!existsSync(logPath)) return [];

  let raw: string;
  try {
    raw = readFileSync(logPath, 'utf-8');
  } catch {
    // Best-effort: an unreadable log is treated as "no history".
    return [];
  }

  const entries: AgentOutcomeEntry[] = [];
  for (const line of raw.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    try {
      const parsed: unknown = JSON.parse(trimmed);
      // Ignore valid-JSON lines that are not objects (e.g. `null`, numbers).
      if (parsed !== null && typeof parsed === 'object') {
        entries.push(parsed as AgentOutcomeEntry);
      }
    } catch {
      // Skip the corrupt line; keep the rest of the history usable.
    }
  }
  return entries;
}
476+
335477
/**
336478
* Spawn `claude --print` with stdin prompt, return stdout.
337479
* Used by `conductor learn --evolve` to generate prompt mutations.
@@ -1113,6 +1255,11 @@ export function createConductorCommands(): Command {
11131255
'Auto-mutate prompt template using GEPA-style evolution from failure data',
11141256
false
11151257
)
1258+
.option(
1259+
'--predict',
1260+
'Show difficulty predictions alongside actual outcomes',
1261+
false
1262+
)
11161263
.action(async (options) => {
11171264
const logPath = getOutcomesLogPath();
11181265
if (!existsSync(logPath)) {
@@ -1344,9 +1491,195 @@ export function createConductorCommands(): Command {
13441491
});
13451492
}
13461493

1494+
// --- predict: show predicted vs actual difficulty ---
1495+
if (options.predict) {
1496+
console.log(
1497+
`\n ${c.b}${c.purple}Difficulty Predictions vs Actual${c.r}\n`
1498+
);
1499+
1500+
// Deduplicate by issue (use last outcome per issue)
1501+
const byIssue = new Map<string, AgentOutcomeEntry>();
1502+
for (const o of outcomes) {
1503+
byIssue.set(o.issue, o);
1504+
}
1505+
1506+
const difficultyColor = {
1507+
easy: c.green,
1508+
medium: c.yellow,
1509+
hard: c.red,
1510+
};
1511+
1512+
for (const [issue, outcome] of byIssue) {
1513+
const issueLabels = outcome.labels || [];
1514+
// Use all outcomes except this issue for prediction (leave-one-out)
1515+
const otherOutcomes = outcomes.filter((o) => o.issue !== issue);
1516+
const pred = predictDifficulty(
1517+
issueLabels,
1518+
'', // no description in outcome data
1519+
0, // no priority in outcome data
1520+
otherOutcomes
1521+
);
1522+
1523+
// Infer actual difficulty from outcome
1524+
const actualDifficulty: DifficultyLevel =
1525+
outcome.outcome === 'success' && outcome.toolCalls < 40
1526+
? 'easy'
1527+
: outcome.outcome === 'failure' || outcome.toolCalls > 80
1528+
? 'hard'
1529+
: 'medium';
1530+
1531+
const match = pred.difficulty === actualDifficulty;
1532+
const matchIcon = match ? `${c.green}${c.r}` : `${c.red}${c.r}`;
1533+
1534+
console.log(
1535+
` ${matchIcon} ${c.white}${issue.padEnd(12)}${c.r} predicted: ${difficultyColor[pred.difficulty]}${pred.difficulty.padEnd(6)}${c.r} actual: ${difficultyColor[actualDifficulty]}${actualDifficulty.padEnd(6)}${c.r} ${c.gray}(${Math.round(pred.confidence * 100)}% conf)${c.r}`
1536+
);
1537+
}
1538+
1539+
const issueList = [...byIssue.values()];
1540+
const correct = issueList.filter((o) => {
1541+
const otherOutcomes = outcomes.filter((oo) => oo.issue !== o.issue);
1542+
const pred = predictDifficulty(o.labels || [], '', 0, otherOutcomes);
1543+
const actual: DifficultyLevel =
1544+
o.outcome === 'success' && o.toolCalls < 40
1545+
? 'easy'
1546+
: o.outcome === 'failure' || o.toolCalls > 80
1547+
? 'hard'
1548+
: 'medium';
1549+
return pred.difficulty === actual;
1550+
}).length;
1551+
const accuracy = Math.round((correct / issueList.length) * 100);
1552+
console.log(
1553+
`\n ${c.b}Accuracy${c.r}: ${accuracy}% (${correct}/${issueList.length})`
1554+
);
1555+
}
1556+
13471557
console.log('');
13481558
});
13491559

1560+
// --- predict ---
1561+
cmd
1562+
.command('predict [issue-id]')
1563+
.description('Predict difficulty for an issue based on historical outcomes')
1564+
.option('--title <title>', 'Issue title (for testing without Linear)')
1565+
.option(
1566+
'--labels <labels>',
1567+
'Comma-separated labels (for testing without Linear)'
1568+
)
1569+
.option('--priority <n>', 'Priority 0-4 (for testing without Linear)', '0')
1570+
.option('--json', 'Output as JSON', false)
1571+
.action(async (issueId: string | undefined, options) => {
1572+
let title = options.title || '';
1573+
let labels: string[] = options.labels
1574+
? options.labels.split(',').map((l: string) => l.trim())
1575+
: [];
1576+
let description = '';
1577+
let priority = parseInt(options.priority, 10) || 0;
1578+
1579+
// If issue-id provided and no inline overrides, fetch from Linear
1580+
if (issueId && !options.title && !options.labels) {
1581+
try {
1582+
const { LinearClient } =
1583+
await import('../../integrations/linear/client.js');
1584+
const { LinearAuthManager } =
1585+
await import('../../integrations/linear/auth.js');
1586+
1587+
let client: InstanceType<typeof LinearClient>;
1588+
try {
1589+
const authManager = new LinearAuthManager(process.cwd());
1590+
const token = await authManager.getValidToken();
1591+
client = new LinearClient({ apiKey: token, useBearer: true });
1592+
} catch {
1593+
const apiKey = process.env.LINEAR_API_KEY;
1594+
if (!apiKey) {
1595+
console.log(
1596+
`${c.red}No Linear auth found.${c.r} Use --title/--labels/--priority for testing.`
1597+
);
1598+
return;
1599+
}
1600+
client = new LinearClient({ apiKey });
1601+
}
1602+
1603+
const issue = await client.getIssue(issueId);
1604+
if (!issue) {
1605+
console.log(`${c.red}Issue ${issueId} not found.${c.r}`);
1606+
return;
1607+
}
1608+
1609+
title = issue.title;
1610+
description = issue.description || '';
1611+
labels = issue.labels.map((l) => l.name);
1612+
priority = issue.priority;
1613+
} catch (err) {
1614+
console.log(
1615+
`${c.red}Failed to fetch issue:${c.r} ${(err as Error).message}`
1616+
);
1617+
return;
1618+
}
1619+
}
1620+
1621+
if (!issueId && !options.title) {
1622+
console.log(
1623+
`${c.yellow}Provide an issue ID or --title/--labels for testing.${c.r}`
1624+
);
1625+
return;
1626+
}
1627+
1628+
const outcomes = loadOutcomes();
1629+
const pred = predictDifficulty(labels, description, priority, outcomes);
1630+
1631+
if (options.json) {
1632+
console.log(
1633+
JSON.stringify(
1634+
{
1635+
issueId: issueId || 'inline',
1636+
title,
1637+
labels,
1638+
priority,
1639+
...pred,
1640+
},
1641+
null,
1642+
2
1643+
)
1644+
);
1645+
return;
1646+
}
1647+
1648+
const difficultyColor = {
1649+
easy: c.green,
1650+
medium: c.yellow,
1651+
hard: c.red,
1652+
};
1653+
1654+
console.log(`\n ${c.b}${c.purple}Difficulty Prediction${c.r}\n`);
1655+
if (title) {
1656+
console.log(` ${c.b}Issue${c.r} ${issueId || 'inline'}`);
1657+
console.log(` ${c.b}Title${c.r} ${title}`);
1658+
}
1659+
if (labels.length > 0) {
1660+
console.log(` ${c.b}Labels${c.r} ${labels.join(', ')}`);
1661+
}
1662+
if (priority > 0) {
1663+
console.log(` ${c.b}Priority${c.r} ${priority}`);
1664+
}
1665+
1666+
console.log(
1667+
`\n ${c.b}Difficulty${c.r} ${difficultyColor[pred.difficulty]}${pred.difficulty.toUpperCase()}${c.r}`
1668+
);
1669+
console.log(
1670+
` ${c.b}Confidence${c.r} ${Math.round(pred.confidence * 100)}%`
1671+
);
1672+
1673+
console.log(`\n ${c.b}Signals${c.r}`);
1674+
for (const reason of pred.reasons) {
1675+
console.log(` ${c.cyan}${c.r} ${reason}`);
1676+
}
1677+
1678+
console.log(
1679+
`\n ${c.d}Based on ${outcomes.length} historical outcomes${c.r}\n`
1680+
);
1681+
});
1682+
13501683
// --- usage ---
13511684
cmd
13521685
.command('usage')
@@ -1445,6 +1778,11 @@ export function createConductorCommands(): Command {
14451778
'Agent mode: "cli" (claude -p, session auth) or "adapter" (JSON-RPC, API key)',
14461779
'cli'
14471780
)
1781+
.option(
1782+
'--model <model>',
1783+
'Model routing: "auto" (complexity-based) or a specific model ID',
1784+
'auto'
1785+
)
14481786
.action(async (options) => {
14491787
// Ensure default prompt template exists on first start
14501788
ensureDefaultPromptTemplate();
@@ -1463,6 +1801,7 @@ export function createConductorCommands(): Command {
14631801
maxRetries: parseInt(options.retries, 10),
14641802
turnTimeoutMs: parseInt(options.turnTimeout, 10),
14651803
agentMode: options.mode === 'adapter' ? 'adapter' : 'cli',
1804+
model: options.model,
14661805
});
14671806

14681807
await conductor.start();

0 commit comments

Comments
 (0)