diff --git a/bench/src/flywheel-run.mts b/bench/src/flywheel-run.mts index a3481bec..f6f58880 100644 --- a/bench/src/flywheel-run.mts +++ b/bench/src/flywheel-run.mts @@ -88,12 +88,15 @@ async function main(): Promise<void> { const { strategy: authored, file } = await authorStrategy({ chat, model: authorModel, - fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-pro', + // The fallback is a FASTER model on purpose: the primary's failure modes are an + // empty reply (no maxTokens) or an edge 524 on a long generation — flash clears both. + fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-flash', environmentName: surface.name, lossesJson: losses, budget, outDir: join(import.meta.dirname, 'authored'), temperature: 0.6, + maxTokens: 8192, }) console.error(` authored "${authored.name}" → ${file}\n`) diff --git a/bench/src/strategy-author.mts b/bench/src/strategy-author.mts index 47e31209..65c055a8 100644 --- a/bench/src/strategy-author.mts +++ b/bench/src/strategy-author.mts @@ -99,12 +99,13 @@ async function main(): Promise<void> { const { strategy: authored, file } = await authorStrategy({ chat, model: authorModel, - fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-pro', + fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 
'deepseek-v4-flash', environmentName: environment.name, lossesJson: losses, budget, outDir: join(import.meta.dirname, 'authored'), temperature: 0.6, + maxTokens: 8192, }) console.error(` authored "${authored.name}" → ${file}`) console.error(' R0 PASS: loaded\n') diff --git a/src/runtime/run-benchmark.ts b/src/runtime/run-benchmark.ts index c057756d..aec90eff 100644 --- a/src/runtime/run-benchmark.ts +++ b/src/runtime/run-benchmark.ts @@ -14,6 +14,7 @@ */ import { pairedBootstrap, paretoFrontier } from '@tangle-network/agent-eval' +import type { RuntimeHooks } from '../runtime-hooks' import { type AgenticOptions, type AgenticSurface, @@ -45,6 +46,9 @@ export interface BenchmarkConfig { /** Progress hook — fires as each task settles (the live-monitoring seam: append to a * progress file, render a tree, stream to a dashboard). `done` counts settled tasks. */ onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void + /** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams + * here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */ + hooks?: RuntimeHooks } export interface BenchmarkLift { @@ -138,6 +142,7 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport> + shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null> Runs ONE worker attempt (a bounded tool loop) over an artifact. - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample). - pass handle => the shot CONTINUES that artifact (state accumulates across shots). - messages => the carried conversation (pass the previous ShotResult.messages to continue). - steer => a corrective instruction injected before the shot. + - persona => { systemPrompt?, model? } — give THIS shot its own role and/or model + (multi-agent strategies: a researcher shot then an engineer shot, a panel of k + personas over one budget). 
On a fresh shot the systemPrompt replaces the task's; on + a carried conversation it arrives as a hand-off message. Same conserved budget. ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors } Returns null if the attempt failed infra-wise. @@ -62,6 +66,10 @@ export interface AuthorStrategyOptions { * empty content without `maxTokens`). Opt-in — absent means the primary's failure * propagates. */ fallbackModel?: string + /** The contract text shown to the author. Default `strategyAuthorContract`. The + * meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each + * variant on the same frozen holdout as any strategy. */ + contract?: string /** The environment the losses came from (orientation only — never the verifiers). */ environmentName: string /** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */ @@ -134,7 +142,7 @@ async function requestAuthoredCode( }, { role: 'user', - content: `${strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`, + content: `${opts.contract ?? strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`, }, ], }, diff --git a/src/runtime/strategy.ts b/src/runtime/strategy.ts index d5826c26..b218483e 100644 --- a/src/runtime/strategy.ts +++ b/src/runtime/strategy.ts @@ -91,6 +91,9 @@ export interface AgenticOptions { /** The depth STEERER's analyst instruction (observe()'s system prompt). 
The knob a * prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */ analystInstruction?: string + /** The critic's model — lets the analyst be a stronger (or cheaper) model than the + * worker. Omitted ⇒ the worker's `model`. */ + analystModel?: string /** Across-run learning: when set, the analyst's observe() pass appends trace-derived * facts here (the flywheel write side). Priming (the read side) is the caller's move — * query the corpus and fold facts into the task's systemPrompt before runAgentic. */ @@ -215,11 +218,12 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions) }) .join('\n') .slice(0, 7000) + const analystModel = opts.analystModel ?? opts.model const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, - defaultModel: opts.model, + defaultModel: analystModel, }) const obs = await observe( { @@ -231,7 +235,7 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions) }, { chat, - model: opts.model, + model: analystModel, ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}), ...(opts.corpus ? { corpus: opts.corpus, tags: opts.corpusTags ?? [] } : {}), }, diff --git a/tests/loops/strategy-suite.test.ts b/tests/loops/strategy-suite.test.ts index 939f0c50..d3eab348 100644 --- a/tests/loops/strategy-suite.test.ts +++ b/tests/loops/strategy-suite.test.ts @@ -10,16 +10,27 @@ * or the conserved compute dose. * - promotionGate: deterministic seeded verdict, minimum-evidence floor, CI margin. 
*/ +import { mkdtempSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' import { afterEach, describe, expect, it, vi } from 'vitest' import { promotionGate } from '../../src/runtime/promotion-gate' -import type { BenchmarkReport, BenchmarkTaskRow } from '../../src/runtime/run-benchmark' +import { + type BenchmarkReport, + type BenchmarkTaskRow, + runBenchmark, +} from '../../src/runtime/run-benchmark' import { type AgenticSurface, type AgenticTask, defineStrategy, runAgentic, } from '../../src/runtime/strategy' -import { assertStrategyContract } from '../../src/runtime/strategy-author' +import { + assertStrategyContract, + authorStrategy, + strategyAuthorContract, +} from '../../src/runtime/strategy-author' // ── Fixtures ────────────────────────────────────────────────────────────────────── @@ -246,3 +257,77 @@ describe('promotionGate', () => { expect(v.reason).toBe('no-margin') }) }) + +// ── The author/optimizer addressability surface ─────────────────────────────────── + +describe('addressable optimization coordinates', () => { + it('the author contract exposes persona (multi-agent strategies are authorable)', () => { + expect(strategyAuthorContract).toContain('persona') + expect(strategyAuthorContract).toContain('systemPrompt') + }) + + it('analystModel routes the critique call to the critic model, not the worker', async () => { + const captured = stubRouter() + const surface = fixtureSurface(() => ({ passes: 0, total: 1 })) + const critiqued = defineStrategy('critiqued', async ({ shot, critique }) => { + const out = await shot() + if (out) await critique(out.messages) + return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 } + }) + await runAgentic({ + surface, + task, + ...worker, + analystModel: 'critic-model', + strategy: critiqued, + budget: 2, + }) + const models = captured.map((r) => (r as { model?: string }).model) + expect(models).toContain('test-model') + 
expect(models).toContain('critic-model') + }) + + it('runBenchmark passes lifecycle hooks through to every cell', async () => { + stubRouter() + const surface = fixtureSurface(() => ({ passes: 1, total: 1 })) + const events: string[] = [] + const oneShot = defineStrategy('one-shot', async ({ shot }) => { + await shot() + return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 } + }) + await runBenchmark({ + environment: surface, + tasks: [task], + worker, + strategies: [oneShot], + budget: 1, + concurrency: 1, + hooks: { onEvent: (e) => void events.push(e.type) }, + }) + expect(events.length).toBeGreaterThan(0) + }) + + it('authorStrategy uses a caller-supplied contract (the meta-optimization coordinate)', async () => { + const dir = mkdtempSync(join(tmpdir(), 'authored-test-')) + const seen: string[] = [] + const module = [ + "export default { name: 'noop', driver: () => ({ name: 'noop', act: async () => ({ kind: 'done', deliverable: {} }) }) }", + ].join('\n') + const chat = { + chat: async (req: { messages: Array<{ content: string }> }) => { + seen.push(req.messages.map((m) => m.content).join('\n')) + return { content: `\`\`\`ts\n${module}\n\`\`\`` } + }, + } as unknown as Parameters<typeof authorStrategy>[0]['chat'] + const { strategy } = await authorStrategy({ + chat, + contract: 'CUSTOM CONTRACT vNEXT', + environmentName: 'fixture', + lossesJson: '[]', + budget: 2, + outDir: dir, + }) + expect(seen.join('\n')).toContain('CUSTOM CONTRACT vNEXT') + expect(strategy.name).toBe('noop') + }) +}) diff --git a/vitest.config.ts b/vitest.config.ts index d8e5d28a..b6d3bfec 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -2,6 +2,6 @@ import { defineConfig } from 'vitest/config' export default defineConfig({ test: { - exclude: ['**/node_modules/**', 'dist/**', 'bench/**'], + exclude: ['**/node_modules/**', 'dist/**', 'bench/**', '**/.claude/worktrees/**'], }, })