tangle-network · drewstone · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/bench/src/flywheel-run.mts b/bench/src/flywheel-run.mts
@@ -88,12 +88,15 @@ async function main(): Promise<void> {
   const { strategy: authored, file } = await authorStrategy({
     chat,
     model: authorModel,
-    fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-pro',
+    // The fallback is a FASTER model on purpose: the primary's failure modes are an
+    // empty reply (no maxTokens) or an edge 524 on a long generation — flash clears both.
+    fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-flash',
     environmentName: surface.name,
     lossesJson: losses,
     budget,
     outDir: join(import.meta.dirname, 'authored'),
     temperature: 0.6,
+    maxTokens: 8192,
   })
   console.error(`  authored "${authored.name}" → ${file}\n`)
 

diff --git a/bench/src/strategy-author.mts b/bench/src/strategy-author.mts
@@ -99,12 +99,13 @@ async function main(): Promise<void> {
   const { strategy: authored, file } = await authorStrategy({
     chat,
     model: authorModel,
-    fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-pro',
+    fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-flash',
     environmentName: environment.name,
     lossesJson: losses,
     budget,
     outDir: join(import.meta.dirname, 'authored'),
     temperature: 0.6,
+    maxTokens: 8192,
   })
   console.error(`  authored "${authored.name}" → ${file}`)
   console.error('  R0 PASS: loaded\n')

diff --git a/src/runtime/run-benchmark.ts b/src/runtime/run-benchmark.ts
@@ -14,6 +14,7 @@
  */
 
 import { pairedBootstrap, paretoFrontier } from '@tangle-network/agent-eval'
+import type { RuntimeHooks } from '../runtime-hooks'
 import {
   type AgenticOptions,
   type AgenticSurface,
@@ -45,6 +46,9 @@ export interface BenchmarkConfig {
   /** Progress hook — fires as each task settles (the live-monitoring seam: append to a
    *  progress file, render a tree, stream to a dashboard). `done` counts settled tasks. */
   onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void
+  /** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams
+   *  here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */
+  hooks?: RuntimeHooks
 }
 
 export interface BenchmarkLift {
@@ -138,6 +142,7 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
           task,
           strategy: s,
           budget,
+          ...(cfg.hooks ? { hooks: cfg.hooks } : {}),
         })
         cells[s.name] = {
           score: r.score,

diff --git a/src/runtime/strategy-author.ts b/src/runtime/strategy-author.ts
@@ -22,12 +22,16 @@ export const strategyAuthorContract = `
 You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
 spend a compute budget to beat a task's deployable check. You compose exactly two steps:
 
-  shot(spec?: { handle?, messages?, steer? }): Promise<ShotResult | null>
+  shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
     Runs ONE worker attempt (a bounded tool loop) over an artifact.
     - omit handle  => the shot opens its OWN fresh artifact and closes it after (a sample).
     - pass handle  => the shot CONTINUES that artifact (state accumulates across shots).
     - messages     => the carried conversation (pass the previous ShotResult.messages to continue).
     - steer        => a corrective instruction injected before the shot.
+    - persona      => { systemPrompt?, model? } — give THIS shot its own role and/or model
+      (multi-agent strategies: a researcher shot then an engineer shot, a panel of k
+      personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
+      a carried conversation it arrives as a hand-off message. Same conserved budget.
     ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
     Returns null if the attempt failed infra-wise.
 
@@ -62,6 +66,10 @@ export interface AuthorStrategyOptions {
    *  empty content without `maxTokens`). Opt-in — absent means the primary's failure
    *  propagates. */
   fallbackModel?: string
+  /** The contract text shown to the author. Default `strategyAuthorContract`. The
+   *  meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each
+   *  variant on the same frozen holdout as any strategy. */
+  contract?: string
   /** The environment the losses came from (orientation only — never the verifiers). */
   environmentName: string
   /** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */
@@ -134,7 +142,7 @@ async function requestAuthoredCode(
         },
         {
           role: 'user',
-          content: `${strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`,
+          content: `${opts.contract ?? strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`,
         },
       ],
     },

diff --git a/src/runtime/strategy.ts b/src/runtime/strategy.ts
@@ -91,6 +91,9 @@ export interface AgenticOptions {
   /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
    *  prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
   analystInstruction?: string
+  /** The critic's model — lets the analyst be a stronger (or cheaper) model than the
+   *  worker. Omitted ⇒ the worker's `model`. */
+  analystModel?: string
   /** Across-run learning: when set, the analyst's observe() pass appends trace-derived
    *  facts here (the flywheel write side). Priming (the read side) is the caller's move —
    *  query the corpus and fold facts into the task's systemPrompt before runAgentic. */
@@ -215,11 +218,12 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
     })
     .join('\n')
     .slice(0, 7000)
+  const analystModel = opts.analystModel ?? opts.model
   const chat = createChatClient({
     transport: 'router',
     apiKey: opts.routerKey,
     baseUrl: opts.routerBaseUrl,
-    defaultModel: opts.model,
+    defaultModel: analystModel,
   })
   const obs = await observe(
     {
@@ -231,7 +235,7 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
     },
     {
       chat,
-      model: opts.model,
+      model: analystModel,
       ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}),
       ...(opts.corpus ? { corpus: opts.corpus, tags: opts.corpusTags ?? [] } : {}),
     },

diff --git a/tests/loops/strategy-suite.test.ts b/tests/loops/strategy-suite.test.ts
@@ -10,16 +10,27 @@
  *     or the conserved compute dose.
  *   - promotionGate: deterministic seeded verdict, minimum-evidence floor, CI margin.
  */
+import { mkdtempSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
 import { afterEach, describe, expect, it, vi } from 'vitest'
 import { promotionGate } from '../../src/runtime/promotion-gate'
-import type { BenchmarkReport, BenchmarkTaskRow } from '../../src/runtime/run-benchmark'
+import {
+  type BenchmarkReport,
+  type BenchmarkTaskRow,
+  runBenchmark,
+} from '../../src/runtime/run-benchmark'
 import {
   type AgenticSurface,
   type AgenticTask,
   defineStrategy,
   runAgentic,
 } from '../../src/runtime/strategy'
-import { assertStrategyContract } from '../../src/runtime/strategy-author'
+import {
+  assertStrategyContract,
+  authorStrategy,
+  strategyAuthorContract,
+} from '../../src/runtime/strategy-author'
 
 // ── Fixtures ──────────────────────────────────────────────────────────────────────
 
@@ -246,3 +257,77 @@ describe('promotionGate', () => {
     expect(v.reason).toBe('no-margin')
   })
 })
+
+// ── The author/optimizer addressability surface ───────────────────────────────────
+
+describe('addressable optimization coordinates', () => {
+  it('the author contract exposes persona (multi-agent strategies are authorable)', () => {
+    expect(strategyAuthorContract).toContain('persona')
+    expect(strategyAuthorContract).toContain('systemPrompt')
+  })
+
+  it('analystModel routes the critique call to the critic model, not the worker', async () => {
+    const captured = stubRouter()
+    const surface = fixtureSurface(() => ({ passes: 0, total: 1 }))
+    const critiqued = defineStrategy('critiqued', async ({ shot, critique }) => {
+      const out = await shot()
+      if (out) await critique(out.messages)
+      return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 }
+    })
+    await runAgentic({
+      surface,
+      task,
+      ...worker,
+      analystModel: 'critic-model',
+      strategy: critiqued,
+      budget: 2,
+    })
+    const models = captured.map((r) => (r as { model?: string }).model)
+    expect(models).toContain('test-model')
+    expect(models).toContain('critic-model')
+  })
+
+  it('runBenchmark passes lifecycle hooks through to every cell', async () => {
+    stubRouter()
+    const surface = fixtureSurface(() => ({ passes: 1, total: 1 }))
+    const events: string[] = []
+    const oneShot = defineStrategy('one-shot', async ({ shot }) => {
+      await shot()
+      return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 }
+    })
+    await runBenchmark({
+      environment: surface,
+      tasks: [task],
+      worker,
+      strategies: [oneShot],
+      budget: 1,
+      concurrency: 1,
+      hooks: { onEvent: (e) => void events.push(e.type) },
+    })
+    expect(events.length).toBeGreaterThan(0)
+  })
+
+  it('authorStrategy uses a caller-supplied contract (the meta-optimization coordinate)', async () => {
+    const dir = mkdtempSync(join(tmpdir(), 'authored-test-'))
+    const seen: string[] = []
+    const module = [
+      "export default { name: 'noop', driver: () => ({ name: 'noop', act: async () => ({ kind: 'done', deliverable: {} }) }) }",
+    ].join('\n')
+    const chat = {
+      chat: async (req: { messages: Array<{ content: string }> }) => {
+        seen.push(req.messages.map((m) => m.content).join('\n'))
+        return { content: `\`\`\`ts\n${module}\n\`\`\`` }
+      },
+    } as unknown as Parameters<typeof authorStrategy>[0]['chat']
+    const { strategy } = await authorStrategy({
+      chat,
+      contract: 'CUSTOM CONTRACT vNEXT',
+      environmentName: 'fixture',
+      lossesJson: '[]',
+      budget: 2,
+      outDir: dir,
+    })
+    expect(seen.join('\n')).toContain('CUSTOM CONTRACT vNEXT')
+    expect(strategy.name).toBe('noop')
+  })
+})
diff --git a/vitest.config.ts b/vitest.config.ts
@@ -2,6 +2,6 @@ import { defineConfig } from 'vitest/config'
 
 export default defineConfig({
   test: {
-    exclude: ['**/node_modules/**', 'dist/**', 'bench/**'],
+    exclude: ['**/node_modules/**', 'dist/**', 'bench/**', '**/.claude/worktrees/**'],
   },
 })