Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion bench/src/flywheel-run.mts
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,15 @@ async function main(): Promise<void> {
const { strategy: authored, file } = await authorStrategy({
chat,
model: authorModel,
fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-pro',
// The fallback is a FASTER model on purpose: the primary's failure modes are an
// empty reply (no maxTokens) or an edge 524 on a long generation — flash clears both.
fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-flash',
environmentName: surface.name,
lossesJson: losses,
budget,
outDir: join(import.meta.dirname, 'authored'),
temperature: 0.6,
maxTokens: 8192,
})
console.error(` authored "${authored.name}" → ${file}\n`)

Expand Down
3 changes: 2 additions & 1 deletion bench/src/strategy-author.mts
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,13 @@ async function main(): Promise<void> {
const { strategy: authored, file } = await authorStrategy({
chat,
model: authorModel,
fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-pro',
fallbackModel: process.env.AUTHOR_FALLBACK_MODEL ?? 'deepseek-v4-flash',
environmentName: environment.name,
lossesJson: losses,
budget,
outDir: join(import.meta.dirname, 'authored'),
temperature: 0.6,
maxTokens: 8192,
})
console.error(` authored "${authored.name}" → ${file}`)
console.error(' R0 PASS: loaded\n')
Expand Down
5 changes: 5 additions & 0 deletions src/runtime/run-benchmark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import { pairedBootstrap, paretoFrontier } from '@tangle-network/agent-eval'
import type { RuntimeHooks } from '../runtime-hooks'
import {
type AgenticOptions,
type AgenticSurface,
Expand Down Expand Up @@ -45,6 +46,9 @@ export interface BenchmarkConfig {
/** Progress hook — fires as each task settles (the live-monitoring seam: append to a
* progress file, render a tree, stream to a dashboard). `done` counts settled tasks. */
onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void
/** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams
* here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */
hooks?: RuntimeHooks
}

export interface BenchmarkLift {
Expand Down Expand Up @@ -138,6 +142,7 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
task,
strategy: s,
budget,
...(cfg.hooks ? { hooks: cfg.hooks } : {}),
})
cells[s.name] = {
score: r.score,
Expand Down
12 changes: 10 additions & 2 deletions src/runtime/strategy-author.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,16 @@ export const strategyAuthorContract = `
You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
spend a compute budget to beat a task's deployable check. You compose exactly two steps:

shot(spec?: { handle?, messages?, steer? }): Promise<ShotResult | null>
shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
Runs ONE worker attempt (a bounded tool loop) over an artifact.
- omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
- pass handle => the shot CONTINUES that artifact (state accumulates across shots).
- messages => the carried conversation (pass the previous ShotResult.messages to continue).
- steer => a corrective instruction injected before the shot.
- persona => { systemPrompt?, model? } — give THIS shot its own role and/or model
(multi-agent strategies: a researcher shot then an engineer shot, a panel of k
personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
a carried conversation it arrives as a hand-off message. Same conserved budget.
ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
Returns null if the attempt failed infra-wise.

Expand Down Expand Up @@ -62,6 +66,10 @@ export interface AuthorStrategyOptions {
* empty content without `maxTokens`). Opt-in — absent means the primary's failure
* propagates. */
fallbackModel?: string
/** The contract text shown to the author. Default `strategyAuthorContract`. The
* meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each
* variant on the same frozen holdout as any strategy. */
contract?: string
/** The environment the losses came from (orientation only — never the verifiers). */
environmentName: string
/** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */
Expand Down Expand Up @@ -134,7 +142,7 @@ async function requestAuthoredCode(
},
{
role: 'user',
content: `${strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`,
content: `${opts.contract ?? strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`,
},
],
},
Expand Down
8 changes: 6 additions & 2 deletions src/runtime/strategy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ export interface AgenticOptions {
/** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
* prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
analystInstruction?: string
/** The critic's model — lets the analyst be a stronger (or cheaper) model than the
* worker. Omitted ⇒ the worker's `model`. */
analystModel?: string
/** Across-run learning: when set, the analyst's observe() pass appends trace-derived
* facts here (the flywheel write side). Priming (the read side) is the caller's move —
* query the corpus and fold facts into the task's systemPrompt before runAgentic. */
Expand Down Expand Up @@ -215,11 +218,12 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
})
.join('\n')
.slice(0, 7000)
const analystModel = opts.analystModel ?? opts.model
const chat = createChatClient({
transport: 'router',
apiKey: opts.routerKey,
baseUrl: opts.routerBaseUrl,
defaultModel: opts.model,
defaultModel: analystModel,
})
const obs = await observe(
{
Expand All @@ -231,7 +235,7 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
},
{
chat,
model: opts.model,
model: analystModel,
...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}),
...(opts.corpus ? { corpus: opts.corpus, tags: opts.corpusTags ?? [] } : {}),
},
Expand Down
89 changes: 87 additions & 2 deletions tests/loops/strategy-suite.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,27 @@
* or the conserved compute dose.
* - promotionGate: deterministic seeded verdict, minimum-evidence floor, CI margin.
*/
import { mkdtempSync, rmSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { afterEach, describe, expect, it, vi } from 'vitest'
import { promotionGate } from '../../src/runtime/promotion-gate'
import type { BenchmarkReport, BenchmarkTaskRow } from '../../src/runtime/run-benchmark'
import {
type BenchmarkReport,
type BenchmarkTaskRow,
runBenchmark,
} from '../../src/runtime/run-benchmark'
import {
type AgenticSurface,
type AgenticTask,
defineStrategy,
runAgentic,
} from '../../src/runtime/strategy'
import { assertStrategyContract } from '../../src/runtime/strategy-author'
import {
assertStrategyContract,
authorStrategy,
strategyAuthorContract,
} from '../../src/runtime/strategy-author'

// ── Fixtures ──────────────────────────────────────────────────────────────────────

Expand Down Expand Up @@ -246,3 +257,77 @@ describe('promotionGate', () => {
expect(v.reason).toBe('no-margin')
})
})

// ── The author/optimizer addressability surface ───────────────────────────────────

/**
 * Addressability checks: each knob an optimizer may turn — `persona` in the author
 * contract, `analystModel`, benchmark `hooks`, and the author `contract` text — must be
 * reachable through the public options exercised below.
 */
describe('addressable optimization coordinates', () => {
  it('the author contract exposes persona (multi-agent strategies are authorable)', () => {
    // The contract is the text handed to the author model, so a capability it does not
    // mention cannot be authored against.
    expect(strategyAuthorContract).toContain('persona')
    expect(strategyAuthorContract).toContain('systemPrompt')
  })

  it('analystModel routes the critique call to the critic model, not the worker', async () => {
    // NOTE(review): stubRouter() presumably returns the live list of captured router
    // request bodies — confirm against the fixture definition.
    const captured = stubRouter()
    const surface = fixtureSurface(() => ({ passes: 0, total: 1 }))
    // One worker shot followed by one critique, so both models get exactly one reason
    // to be called.
    const critiqued = defineStrategy('critiqued', async ({ shot, critique }) => {
      const out = await shot()
      if (out) await critique(out.messages)
      return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 }
    })
    await runAgentic({
      surface,
      task,
      ...worker,
      analystModel: 'critic-model',
      strategy: critiqued,
      budget: 2,
    })
    const models = captured.map((r) => (r as { model?: string }).model)
    // Both models must appear: the worker's (presumably 'test-model' from the `worker`
    // fixture) and the critic's override.
    expect(models).toContain('test-model')
    expect(models).toContain('critic-model')
  })

  it('runBenchmark passes lifecycle hooks through to every cell', async () => {
    stubRouter()
    const surface = fixtureSurface(() => ({ passes: 1, total: 1 }))
    const events: string[] = []
    const oneShot = defineStrategy('one-shot', async ({ shot }) => {
      await shot()
      return { score: 0, resolved: false, completions: 1, progression: [0], shots: 1 }
    })
    await runBenchmark({
      environment: surface,
      tasks: [task],
      worker,
      strategies: [oneShot],
      budget: 1,
      concurrency: 1,
      // `void` discards push()'s numeric return so the callback returns nothing.
      hooks: { onEvent: (e) => void events.push(e.type) },
    })
    // Any event at all proves the hooks object reached the underlying run.
    expect(events.length).toBeGreaterThan(0)
  })

  it('authorStrategy uses a caller-supplied contract (the meta-optimization coordinate)', async () => {
    const dir = mkdtempSync(join(tmpdir(), 'authored-test-'))
    const seen: string[] = []
    // Minimal module source the stubbed author "writes" back.
    const moduleSource =
      "export default { name: 'noop', driver: () => ({ name: 'noop', act: async () => ({ kind: 'done', deliverable: {} }) }) }"
    // Stub chat client: records every prompt it is sent and always replies with the
    // module above wrapped in a ```ts fence.
    const chat = {
      chat: async (req: { messages: Array<{ content: string }> }) => {
        seen.push(req.messages.map((m) => m.content).join('\n'))
        return { content: `\`\`\`ts\n${moduleSource}\n\`\`\`` }
      },
    } as unknown as Parameters<typeof authorStrategy>[0]['chat']
    try {
      const { strategy } = await authorStrategy({
        chat,
        contract: 'CUSTOM CONTRACT vNEXT',
        environmentName: 'fixture',
        lossesJson: '[]',
        budget: 2,
        outDir: dir,
      })
      expect(seen.join('\n')).toContain('CUSTOM CONTRACT vNEXT')
      expect(strategy.name).toBe('noop')
    } finally {
      // Remove the scratch directory even when the call or an assertion throws, so
      // repeated runs do not pile up `authored-test-*` folders in the OS temp dir.
      rmSync(dir, { recursive: true, force: true })
    }
  })
})
2 changes: 1 addition & 1 deletion vitest.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ import { defineConfig } from 'vitest/config'

/**
 * Vitest configuration.
 *
 * `test.exclude` lists the glob patterns Vitest skips when collecting test files.
 * The key must appear exactly once: a repeated key in an object literal is a
 * TypeScript error (TS1117) and, at runtime, only the last occurrence takes effect.
 */
export default defineConfig({
  test: {
    exclude: ['**/node_modules/**', 'dist/**', 'bench/**', '**/.claude/worktrees/**'],
  },
})
Loading