@@ -332,6 +332,148 @@ function ensureDefaultPromptTemplate(): string {
332332 return templatePath ;
333333}
334334
335+ // ── Difficulty Prediction ──
336+
/** Coarse difficulty bucket assigned to an issue by the predictor. */
export type DifficultyLevel =
  | 'easy'
  | 'medium'
  | 'hard';
338+
/** Result returned by `predictDifficulty`. */
export interface DifficultyPrediction {
  /** The predicted difficulty bucket. */
  difficulty: DifficultyLevel;
  /** Score that starts at 0.5 and grows by 0.1 per signal that fired, capped at 0.9. */
  confidence: number;
  /** Human-readable explanation for each signal that fired (never empty). */
  reasons: Array<string>;
}
344+
/**
 * Predict issue difficulty from labels, description, priority, and historical outcomes.
 * Used for model selection and retry budget allocation.
 *
 * Signals are applied in a fixed order and later signals may override earlier ones:
 *   1. failure rate of past outcomes sharing a label (>60% → hard, <20% → easy)
 *   2. short description (<100 chars) with a bug/fix label → easy
 *   3. long description (>500 chars) and/or feature/refactor/architecture label → harder
 *   4. priority 1 or 2 → bump one level
 *   5. average tool calls of past outcomes sharing a label > 80 → hard
 *
 * @param labels - Labels of the issue being assessed (compared case-insensitively).
 * @param description - Issue description; only its length is inspected.
 * @param priority - Issue priority; 1 is reported as "urgent" and 2 as "high".
 * @param outcomes - Historical outcomes used for signals 1 and 5.
 * @returns The predicted level, a confidence starting at 0.5 and raised by 0.1 per
 *   fired signal (capped at 0.9), and one reason string per contributing signal.
 */
export function predictDifficulty(
  labels: string[],
  description: string,
  priority: number,
  outcomes: AgentOutcomeEntry[]
): DifficultyPrediction {
  // History is only trusted once at least this many label-matched samples exist.
  const MIN_HISTORY_SAMPLES = 3;

  let difficulty: DifficultyLevel = 'medium';
  let confidence = 0.5;
  const reasons: string[] = [];
  const lowerLabels = labels.map((l) => l.toLowerCase());
  const labelSet = new Set(lowerLabels);

  const bumpConfidence = (): void => {
    confidence = Math.min(confidence + 0.1, 0.9);
  };

  // Outcomes sharing at least one label with this issue. Computed once and
  // reused by signals 1 and 5. Entries come from parsed JSONL, so labels are
  // checked defensively rather than trusted to be a string array.
  const matching =
    labelSet.size > 0
      ? outcomes.filter(
          (o) =>
            Array.isArray(o.labels) &&
            o.labels.some(
              (ol) => typeof ol === 'string' && labelSet.has(ol.toLowerCase())
            )
        )
      : [];
  const hasHistory = matching.length >= MIN_HISTORY_SAMPLES;

  // Signal 1: Historical failure rate for matching labels
  if (hasHistory) {
    const failures = matching.filter((o) => o.outcome === 'failure').length;
    const failRate = failures / matching.length;
    if (failRate > 0.6 || failRate < 0.2) {
      difficulty = failRate > 0.6 ? 'hard' : 'easy';
      bumpConfidence();
      reasons.push(
        `Historical failure rate ${Math.round(failRate * 100)} % for similar labels`
      );
    }
  }

  // Signal 2: Short description + bug/fix label → likely easy
  const hasBugOrFix = lowerLabels.some(
    (l) => l === 'bug' || l === 'fix' || l.includes('bugfix')
  );
  if (description.length < 100 && hasBugOrFix) {
    difficulty = 'easy';
    bumpConfidence();
    reasons.push('Short description with bug/fix label suggests simple fix');
  }

  // Signal 3: Long description or feature/refactor label → likely hard
  const isLong = description.length > 500;
  const hasComplexLabel = lowerLabels.some(
    (l) =>
      l === 'feature' ||
      l === 'refactor' ||
      l === 'refactoring' ||
      l === 'architecture'
  );
  if (isLong || hasComplexLabel) {
    if (difficulty !== 'hard') {
      if (isLong && hasComplexLabel) {
        // Both indicators agree: go straight to hard.
        difficulty = 'hard';
      } else {
        // A single indicator only softens an earlier "easy" verdict to medium.
        difficulty = difficulty === 'easy' ? 'medium' : 'hard';
      }
    }
    bumpConfidence();
    if (isLong) {
      reasons.push(
        `Long description (${description.length} chars) suggests complexity`
      );
    }
    if (hasComplexLabel) {
      reasons.push('Feature/refactor label suggests higher complexity');
    }
  }

  // Signal 4: High priority → bump difficulty
  if (priority === 1 || priority === 2) {
    if (difficulty === 'easy') difficulty = 'medium';
    else if (difficulty === 'medium') difficulty = 'hard';
    bumpConfidence();
    reasons.push(
      `Priority ${priority} (${priority === 1 ? 'urgent' : 'high'} ) — higher difficulty expected`
    );
  }

  // Signal 5: Historical avg toolCalls for similar labels > 80 → hard
  if (hasHistory) {
    // Ignore entries without a usable tool-call count; a single missing value
    // would otherwise turn the average into NaN and silently disable the signal.
    const toolCallSamples: number[] = [];
    for (const o of matching) {
      const calls: unknown = o.toolCalls;
      if (typeof calls === 'number' && Number.isFinite(calls)) {
        toolCallSamples.push(calls);
      }
    }
    if (toolCallSamples.length >= MIN_HISTORY_SAMPLES) {
      const avgToolCalls =
        toolCallSamples.reduce((sum, n) => sum + n, 0) / toolCallSamples.length;
      if (avgToolCalls > 80) {
        difficulty = 'hard';
        bumpConfidence();
        reasons.push(
          `Historical avg tool calls ${Math.round(avgToolCalls)} for similar labels (>80)`
        );
      }
    }
  }

  if (reasons.length === 0) {
    reasons.push('No strong signals — defaulting to medium');
  }

  return { difficulty, confidence, reasons };
}
461+
/**
 * Load outcomes from disk.
 *
 * Reads the outcomes log (one JSON object per line) from the path returned by
 * `getOutcomesLogPath()`. Returns an empty array when the file does not exist
 * or cannot be read. Lines that are blank, fail to parse, or do not contain a
 * JSON object are skipped individually, so a single corrupt line (e.g. a
 * truncated write) does not discard the rest of the history.
 */
function loadOutcomes(): AgentOutcomeEntry[] {
  const logPath = getOutcomesLogPath();
  if (!existsSync(logPath)) return [];

  let raw: string;
  try {
    raw = readFileSync(logPath, 'utf-8');
  } catch {
    // Best-effort: an unreadable log is treated as "no history".
    return [];
  }

  const entries: AgentOutcomeEntry[] = [];
  for (const line of raw.split('\n')) {
    const text = line.trim();
    if (!text) continue;
    try {
      const parsed: unknown = JSON.parse(text);
      if (
        parsed !== null &&
        typeof parsed === 'object' &&
        !Array.isArray(parsed)
      ) {
        entries.push(parsed as AgentOutcomeEntry);
      }
    } catch {
      // Skip only the malformed line; keep everything else.
    }
  }
  return entries;
}
476+
335477/**
336478 * Spawn `claude --print` with stdin prompt, return stdout.
337479 * Used by `conductor learn --evolve` to generate prompt mutations.
@@ -1113,6 +1255,11 @@ export function createConductorCommands(): Command {
11131255 'Auto-mutate prompt template using GEPA-style evolution from failure data' ,
11141256 false
11151257 )
1258+ . option (
1259+ '--predict' ,
1260+ 'Show difficulty predictions alongside actual outcomes' ,
1261+ false
1262+ )
11161263 . action ( async ( options ) => {
11171264 const logPath = getOutcomesLogPath ( ) ;
11181265 if ( ! existsSync ( logPath ) ) {
@@ -1344,9 +1491,195 @@ export function createConductorCommands(): Command {
13441491 } ) ;
13451492 }
13461493
1494+ // --- predict: show predicted vs actual difficulty ---
1495+ if ( options . predict ) {
1496+ console . log (
1497+ `\n ${ c . b } ${ c . purple } Difficulty Predictions vs Actual${ c . r } \n`
1498+ ) ;
1499+
1500+ // Deduplicate by issue (use last outcome per issue)
1501+ const byIssue = new Map < string , AgentOutcomeEntry > ( ) ;
1502+ for ( const o of outcomes ) {
1503+ byIssue . set ( o . issue , o ) ;
1504+ }
1505+
1506+ const difficultyColor = {
1507+ easy : c . green ,
1508+ medium : c . yellow ,
1509+ hard : c . red ,
1510+ } ;
1511+
1512+ for ( const [ issue , outcome ] of byIssue ) {
1513+ const issueLabels = outcome . labels || [ ] ;
1514+ // Use all outcomes except this issue for prediction (leave-one-out)
1515+ const otherOutcomes = outcomes . filter ( ( o ) => o . issue !== issue ) ;
1516+ const pred = predictDifficulty (
1517+ issueLabels ,
1518+ '' , // no description in outcome data
1519+ 0 , // no priority in outcome data
1520+ otherOutcomes
1521+ ) ;
1522+
1523+ // Infer actual difficulty from outcome
1524+ const actualDifficulty : DifficultyLevel =
1525+ outcome . outcome === 'success' && outcome . toolCalls < 40
1526+ ? 'easy'
1527+ : outcome . outcome === 'failure' || outcome . toolCalls > 80
1528+ ? 'hard'
1529+ : 'medium' ;
1530+
1531+ const match = pred . difficulty === actualDifficulty ;
1532+ const matchIcon = match ? `${ c . green } ✓${ c . r } ` : `${ c . red } ✗${ c . r } ` ;
1533+
1534+ console . log (
1535+ ` ${ matchIcon } ${ c . white } ${ issue . padEnd ( 12 ) } ${ c . r } predicted: ${ difficultyColor [ pred . difficulty ] } ${ pred . difficulty . padEnd ( 6 ) } ${ c . r } actual: ${ difficultyColor [ actualDifficulty ] } ${ actualDifficulty . padEnd ( 6 ) } ${ c . r } ${ c . gray } (${ Math . round ( pred . confidence * 100 ) } % conf)${ c . r } `
1536+ ) ;
1537+ }
1538+
1539+ const issueList = [ ...byIssue . values ( ) ] ;
1540+ const correct = issueList . filter ( ( o ) => {
1541+ const otherOutcomes = outcomes . filter ( ( oo ) => oo . issue !== o . issue ) ;
1542+ const pred = predictDifficulty ( o . labels || [ ] , '' , 0 , otherOutcomes ) ;
1543+ const actual : DifficultyLevel =
1544+ o . outcome === 'success' && o . toolCalls < 40
1545+ ? 'easy'
1546+ : o . outcome === 'failure' || o . toolCalls > 80
1547+ ? 'hard'
1548+ : 'medium' ;
1549+ return pred . difficulty === actual ;
1550+ } ) . length ;
1551+ const accuracy = Math . round ( ( correct / issueList . length ) * 100 ) ;
1552+ console . log (
1553+ `\n ${ c . b } Accuracy${ c . r } : ${ accuracy } % (${ correct } /${ issueList . length } )`
1554+ ) ;
1555+ }
1556+
13471557 console . log ( '' ) ;
13481558 } ) ;
13491559
1560+ // --- predict ---
1561+ cmd
1562+ . command ( 'predict [issue-id]' )
1563+ . description ( 'Predict difficulty for an issue based on historical outcomes' )
1564+ . option ( '--title <title>' , 'Issue title (for testing without Linear)' )
1565+ . option (
1566+ '--labels <labels>' ,
1567+ 'Comma-separated labels (for testing without Linear)'
1568+ )
1569+ . option ( '--priority <n>' , 'Priority 0-4 (for testing without Linear)' , '0' )
1570+ . option ( '--json' , 'Output as JSON' , false )
1571+ . action ( async ( issueId : string | undefined , options ) => {
1572+ let title = options . title || '' ;
1573+ let labels : string [ ] = options . labels
1574+ ? options . labels . split ( ',' ) . map ( ( l : string ) => l . trim ( ) )
1575+ : [ ] ;
1576+ let description = '' ;
1577+ let priority = parseInt ( options . priority , 10 ) || 0 ;
1578+
1579+ // If issue-id provided and no inline overrides, fetch from Linear
1580+ if ( issueId && ! options . title && ! options . labels ) {
1581+ try {
1582+ const { LinearClient } =
1583+ await import ( '../../integrations/linear/client.js' ) ;
1584+ const { LinearAuthManager } =
1585+ await import ( '../../integrations/linear/auth.js' ) ;
1586+
1587+ let client : InstanceType < typeof LinearClient > ;
1588+ try {
1589+ const authManager = new LinearAuthManager ( process . cwd ( ) ) ;
1590+ const token = await authManager . getValidToken ( ) ;
1591+ client = new LinearClient ( { apiKey : token , useBearer : true } ) ;
1592+ } catch {
1593+ const apiKey = process . env . LINEAR_API_KEY ;
1594+ if ( ! apiKey ) {
1595+ console . log (
1596+ `${ c . red } No Linear auth found.${ c . r } Use --title/--labels/--priority for testing.`
1597+ ) ;
1598+ return ;
1599+ }
1600+ client = new LinearClient ( { apiKey } ) ;
1601+ }
1602+
1603+ const issue = await client . getIssue ( issueId ) ;
1604+ if ( ! issue ) {
1605+ console . log ( `${ c . red } Issue ${ issueId } not found.${ c . r } ` ) ;
1606+ return ;
1607+ }
1608+
1609+ title = issue . title ;
1610+ description = issue . description || '' ;
1611+ labels = issue . labels . map ( ( l ) => l . name ) ;
1612+ priority = issue . priority ;
1613+ } catch ( err ) {
1614+ console . log (
1615+ `${ c . red } Failed to fetch issue:${ c . r } ${ ( err as Error ) . message } `
1616+ ) ;
1617+ return ;
1618+ }
1619+ }
1620+
1621+ if ( ! issueId && ! options . title ) {
1622+ console . log (
1623+ `${ c . yellow } Provide an issue ID or --title/--labels for testing.${ c . r } `
1624+ ) ;
1625+ return ;
1626+ }
1627+
1628+ const outcomes = loadOutcomes ( ) ;
1629+ const pred = predictDifficulty ( labels , description , priority , outcomes ) ;
1630+
1631+ if ( options . json ) {
1632+ console . log (
1633+ JSON . stringify (
1634+ {
1635+ issueId : issueId || 'inline' ,
1636+ title,
1637+ labels,
1638+ priority,
1639+ ...pred ,
1640+ } ,
1641+ null ,
1642+ 2
1643+ )
1644+ ) ;
1645+ return ;
1646+ }
1647+
1648+ const difficultyColor = {
1649+ easy : c . green ,
1650+ medium : c . yellow ,
1651+ hard : c . red ,
1652+ } ;
1653+
1654+ console . log ( `\n ${ c . b } ${ c . purple } Difficulty Prediction${ c . r } \n` ) ;
1655+ if ( title ) {
1656+ console . log ( ` ${ c . b } Issue${ c . r } ${ issueId || 'inline' } ` ) ;
1657+ console . log ( ` ${ c . b } Title${ c . r } ${ title } ` ) ;
1658+ }
1659+ if ( labels . length > 0 ) {
1660+ console . log ( ` ${ c . b } Labels${ c . r } ${ labels . join ( ', ' ) } ` ) ;
1661+ }
1662+ if ( priority > 0 ) {
1663+ console . log ( ` ${ c . b } Priority${ c . r } ${ priority } ` ) ;
1664+ }
1665+
1666+ console . log (
1667+ `\n ${ c . b } Difficulty${ c . r } ${ difficultyColor [ pred . difficulty ] } ${ pred . difficulty . toUpperCase ( ) } ${ c . r } `
1668+ ) ;
1669+ console . log (
1670+ ` ${ c . b } Confidence${ c . r } ${ Math . round ( pred . confidence * 100 ) } %`
1671+ ) ;
1672+
1673+ console . log ( `\n ${ c . b } Signals${ c . r } ` ) ;
1674+ for ( const reason of pred . reasons ) {
1675+ console . log ( ` ${ c . cyan } →${ c . r } ${ reason } ` ) ;
1676+ }
1677+
1678+ console . log (
1679+ `\n ${ c . d } Based on ${ outcomes . length } historical outcomes${ c . r } \n`
1680+ ) ;
1681+ } ) ;
1682+
13501683 // --- usage ---
13511684 cmd
13521685 . command ( 'usage' )
@@ -1445,6 +1778,11 @@ export function createConductorCommands(): Command {
14451778 'Agent mode: "cli" (claude -p, session auth) or "adapter" (JSON-RPC, API key)' ,
14461779 'cli'
14471780 )
1781+ . option (
1782+ '--model <model>' ,
1783+ 'Model routing: "auto" (complexity-based) or a specific model ID' ,
1784+ 'auto'
1785+ )
14481786 . action ( async ( options ) => {
14491787 // Ensure default prompt template exists on first start
14501788 ensureDefaultPromptTemplate ( ) ;
@@ -1463,6 +1801,7 @@ export function createConductorCommands(): Command {
14631801 maxRetries : parseInt ( options . retries , 10 ) ,
14641802 turnTimeoutMs : parseInt ( options . turnTimeout , 10 ) ,
14651803 agentMode : options . mode === 'adapter' ? 'adapter' : 'cli' ,
1804+ model : options . model ,
14661805 } ) ;
14671806
14681807 await conductor . start ( ) ;
0 commit comments