From 21608ef30c885adf1f644e38551b1ddf8218fa15 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 23 Jun 2026 16:26:15 -0600 Subject: [PATCH 1/2] docs(examples): prune + clarify the example set for tip-top DX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Land a developer in examples/, let them read ONE short example, and feel the power. Three confirmed defects fixed plus the centerpiece added. - DELETE examples/coder-loop/: its .ts was refactored off runLoop to worktreeLoopRunner, and its README still taught the deleted coderProfile export, so it no longer demonstrated "the same runLoop kernel" it claimed. researcher-loop is now the primary runLoop teacher. - ADD examples/driver-loop/: the centerpiece. A multi-round refine driver whose plan() READS the last worker's output from history and COMPOSES the next prompt FROM it — "the fold" every supervisor is built on, made visible with heavy plain-language comments. A second labeled section contrasts it with multishot so round vs shot sit side by side. Offline, e2e-proven (round 0 rejected -> driver folds -> round 1 passes via the corrected prompt). - IMPROVE stale-API docs and teaching comments: - mcp-delegation/README + fleet-delegation/README: replace the deleted delegate_code/delegate_research tools with the generic delegate verb + the MCP_ENABLE_DELEGATE=1 gate + the always-on feedback/status/history trio. - product-eval/README: teach runPersonaConversation (evalPersona was deleted in 0.76.0); the .ts was already correct. - researcher-loop / ui-audit / self-improving-loop / supervisor-loop: inline-define round and shot the first time each appears, and point at driver-loop/ as the example that actually shows the fold. - delegate: import from the @tangle-network/agent-runtime/loops subpath instead of a relative dist path; strip emoji from console output. 
- INDEX examples/README.md: rewrite as an ordered "use this when" path over tiers (cores -> driver/supervisor -> runLoop kernel -> production runtime -> self-improvement), add a vocabulary block, add the missing driver-loop / supervise / delegate / intelligence-drop-in rows, drop coder-loop. All edits stay inside examples/; build + typecheck + typecheck:examples + Biome lint are green, and driver-loop and researcher-loop run offline. --- examples/README.md | 159 +++++----- examples/coder-loop/README.md | 109 ------- examples/coder-loop/coder-loop.ts | 72 ----- examples/delegate/e2e-delegate-real.ts | 2 +- examples/driver-loop/README.md | 72 +++++ examples/driver-loop/driver-loop.ts | 279 ++++++++++++++++++ examples/fleet-delegation/README.md | 6 +- examples/mcp-delegation/README.md | 29 +- examples/product-eval/README.md | 19 +- examples/researcher-loop/README.md | 22 +- examples/researcher-loop/researcher-loop.ts | 5 + .../self-improving-loop.ts | 5 + examples/supervise/supervise.ts | 2 +- examples/supervisor-loop/run-bridge.ts | 4 +- examples/supervisor-loop/run-sandbox.ts | 25 +- .../supervisor-loop/run-supervisor-mcp.ts | 4 +- examples/supervisor-loop/shared.ts | 9 +- examples/ui-audit/README.md | 4 +- examples/ui-audit/ui-audit.ts | 4 + 19 files changed, 528 insertions(+), 303 deletions(-) delete mode 100644 examples/coder-loop/README.md delete mode 100644 examples/coder-loop/coder-loop.ts create mode 100644 examples/driver-loop/README.md create mode 100644 examples/driver-loop/driver-loop.ts diff --git a/examples/README.md b/examples/README.md index ca9a8bf4..084429f8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,114 +1,135 @@ # agent-runtime examples -Ordered as a learning progression — each example introduces one concept on top of the previous one. The first three cover the package's three cores: the production chat/task runtime, the optimization suite, and the recursive Supervisor. The rest go deeper into each. +A learning path. 
Read the examples in order — each one adds a single concept on top of the last. +The fastest way to feel the package is to read **ONE** example: [`driver-loop/`](./driver-loop/) +(below), which shows the move every supervisor is built on. -Every example imports from `@tangle-network/agent-runtime` (the same surface consumers use), not from relative paths. All of them are typechecked by `pnpm run typecheck:examples` (wired into `pnpm run typecheck`). +Every example imports from `@tangle-network/agent-runtime` (the surface consumers use), not from +relative paths, and all of them are typechecked by `pnpm run typecheck:examples`. -Era tags: **production runtime** (`runAgentTask` / `handleChatTurn` — what every product runs), **loops suite** (`Environment` / `defineStrategy` / `runBenchmark` — the optimization layer), **supervisor core** (`Scope` / `Supervisor` / personify — the recursive atom; prefer it for new recursive work), **runLoop kernel** (the round-synchronous driver loop), **infra** (transports, MCP, observability). +## Vocabulary -## Start here — the three cores +These words appear in every example. The clearest demonstration of all of them is +[`driver-loop/`](./driver-loop/). -| # | Example | Era | One sentence | -|---|---|---|---| -| 1 | [`chat-handler/`](./chat-handler/) | production runtime | `handleChatTurn` — the production chat turn lifecycle every product runs | -| 2 | [`strategy-suite/`](./strategy-suite/) | loops suite | `Environment` + `defineStrategy` + `runBenchmark` — author and compare optimization strategies against your own check (needs `TANGLE_API_KEY`) | -| 3 | [`recursive-supervisor/`](./recursive-supervisor/) | supervisor core | One `Agent` spawning children through `scope.spawn` on a conserved budget pool, plus the `fanout` combinator (offline) | +- **round** — one driver cycle: `plan → run workers → decide` (the `runLoop` kernel runs this once per round). 
+- **shot** — one independent worker attempt/sample; **multishot** plays N shots in parallel. +- **sample** — best-of-N shots (breadth); **refine** — iterate-with-critique across rounds (depth). +- **the fold** — a driver reading the last worker's output and writing the next instruction *from* it. -## The production runtime, deeper +## Tier 0 — the three cores (read one, feel the power) -| # | Example | Era | One sentence | -|---|---|---|---| -| 4 | [`knowledge-gating/`](./knowledge-gating/) | production runtime | The minimal `AgentAdapter` + `requiredKnowledge` + readiness gating | -| 5 | [`sanitized-telemetry-streaming/`](./sanitized-telemetry-streaming/) | production runtime | Redaction-by-default telemetry collectors (streaming + non-streaming) | -| 6 | [`runtime-run/`](./runtime-run/) | production runtime | `startRuntimeRun` + cost ledger persistence | -| 7 | [`stream-backends/`](./stream-backends/) | infra | The three stream transports (iterable / sandbox / OpenAI-compatible) + SSE helpers, side by side | +| # | Example | Use this when… | +|---|---|---| +| 1 | [`chat-handler/`](./chat-handler/) | You're wiring a product's chat turn — the `handleChatTurn` lifecycle every product runs. | +| 2 | [`strategy-suite/`](./strategy-suite/) | You want to compare optimization strategies (sample vs refine vs your own) against your own pass/fail check (needs `TANGLE_API_KEY`). | +| 3 | [`recursive-supervisor/`](./recursive-supervisor/) | You want the raw recursive atom: one `Agent` spawning children on a conserved budget pool, shown twice (raw `scope.spawn` + the `fanout` combinator, offline). 
| -## Delegation + tools +## Tier 1 — the driver loop & supervisor (the heart of the product) -| # | Example | Era | One sentence | -|---|---|---|---| -| 8 | [`mcp-delegation/`](./mcp-delegation/) | infra | Mount `agent-runtime-mcp` in an `AgentProfile` — exposes `delegate_code`, `delegate_research`, `delegate_feedback`, `delegation_status`, `delegation_history` (plus `delegate_ui_audit` when a UI-audit runner is wired) | -| 9 | [`fleet-delegation/`](./fleet-delegation/) | infra | `TANGLE_FLEET_ID` flips delegation from sibling-sandbox to fleet-workspace topology | +| # | Example | Use this when… | +|---|---|---| +| 4 | [`driver-loop/`](./driver-loop/) | **You want to SEE the fold** — a driver reads the last worker's output and composes the next prompt from it (plan → run → decide → re-plan). The seam that makes everything else click. Offline. | +| 5 | [`supervise/`](./supervise/) | You want the one-call headline: `supervise(profile, goal)` — a router-brained supervisor with all scaffolding defaulted (needs `TANGLE_API_KEY`). | +| 6 | [`supervisor-loop/`](./supervisor-loop/) | You want that same supervisor over a real worker backend — sandbox box / local cli-bridge / coordination MCP — with the **worker backend as the only knob**. | +| 7 | [`delegate/`](./delegate/) | You want the one-call `delegate(intent)` proven e2e: a worker does real on-disk filesystem work, the gate settles only when the file exists, cost rides through (needs `TANGLE_API_KEY`). 
| -## The loops suite, deeper — search, evals, and the RSI verb +## Tier 2 — the runLoop kernel (the leaf the benches drive) -| # | Example | Era | One sentence | -|---|---|---|---| -| 9c | [`strategy-evolution/`](./strategy-evolution/) | loops suite | `runStrategyEvolution` + `promotionGate` — the policy-search journey: author candidate strategies from losses, advance a champion, promote on a fresh holdout slice (needs `TANGLE_API_KEY`) | -| 9d | [`product-eval/`](./product-eval/) | loops suite | `evalPersona` — user-sim product evals in one call: scripted + LLM-adversarial personas, plus the `runPersonaDispatch` → `runProfileMatrix` scored path (needs `TANGLE_API_KEY`; offline-testable via a `backendFor` override) | +The round-synchronous kernel: `driver.plan()` → N tasks → one sandbox per iteration → `output.parse` +→ `validator.validate` → `driver.decide`. The drivers below are single-round and content-blind on +purpose — read [`driver-loop/`](./driver-loop/) for the contrast (a driver that re-plans from output). -## The supervisor core, deeper — an agent drives N agents +| # | Example | Use this when… | +|---|---|---| +| 8 | [`researcher-loop/`](./researcher-loop/) | You want the canonical `runLoop` + inline fanout driver, with a validator that hard-fails a namespace leak so the kernel prunes the bad candidate (needs the optional `@tangle-network/agent-knowledge` peer). | +| 9 | [`ui-audit/`](./ui-audit/) | You want the smallest end-to-end `runLoop` over a real client (Playwright + stub judge), persisting findings. 
| -| # | Example | Era | One sentence | -|---|---|---|---| -| 9b | [`supervisor-loop/`](./supervisor-loop/) | supervisor core | One LLM SUPERVISOR (`driverAgent`) spawns + drives N worker agents to a checked completion on one conserved pool — the SAME code over `router-tools` / `sandbox` (a box) / `bridge` (local cli-bridge), swapping only the worker-leaf seam | +## Tier 3 — the production runtime, deeper -## The runLoop kernel (driver-planned fanout) +| # | Example | Use this when… | +|---|---|---| +| 10 | [`knowledge-gating/`](./knowledge-gating/) | You want readiness gating: the loop BLOCKS when a required-knowledge confidence is below threshold (also the smallest `runAgentTask`). | +| 11 | [`runtime-run/`](./runtime-run/) | You want the run-record + cost-ledger persistence lifecycle for dashboards. | +| 12 | [`stream-backends/`](./stream-backends/) | You want to pick a stream transport (iterable / sandbox / OpenAI-compatible) — the "pick your backend" map (OpenAI section needs `OPENAI_API_KEY`). | +| 13 | [`sanitized-telemetry-streaming/`](./sanitized-telemetry-streaming/) | You want redaction-by-default telemetry on the stream (and the `task.intent` PII footgun). | -The round-synchronous kernel: `driver.plan()` → N tasks → one sandbox per iteration → parse → validate → `driver.decide`. The drivers below are hand-written inline (`plan` + `decide` — two functions); for new recursive work prefer the supervisor core (#3). 
+## Tier 4 — delegation over MCP -| # | Example | Era | One sentence | -|---|---|---|---| -| 10 | [`coder-loop/`](./coder-loop/) | runLoop kernel | `coderProfile` + `runLoop` + an inline fanout driver — kernel picks the winner | -| 11 | [`researcher-loop/`](./researcher-loop/) | runLoop kernel | `researcherProfile` (from `@tangle-network/agent-knowledge/profiles`) + the namespace-leak hard-fail validator | -| 12 | [`ui-audit/`](./ui-audit/) | runLoop kernel | `uiAuditorProfile` + an in-process `SandboxClient` (Playwright + stub judge) + Markdown findings writer | +| # | Example | Use this when… | +|---|---|---| +| 14 | [`mcp-delegation/`](./mcp-delegation/) | You want to mount `agent-runtime-mcp` in an `AgentProfile`. Exposes the generic `delegate` verb (opt in with `MCP_ENABLE_DELEGATE=1`) plus the always-on `delegate_feedback` / `delegation_status` / `delegation_history` trio (and `delegate_ui_audit` when a UI-audit runner is wired). Needs `pnpm build` first. | +| 15 | [`fleet-delegation/`](./fleet-delegation/) | You want `TANGLE_FLEET_ID` to flip delegation from sibling-sandbox to fleet-workspace topology. 
| -## Self-improvement + observability +## Tier 5 — self-improvement & intelligence -| # | Example | Era | One sentence | -|---|---|---|---| -| 13 | [`self-improving-loop/`](./self-improving-loop/) | loops suite (pedagogical) | The v0 → judge → analyst → mutation → v1 → gate cycle, offline; production paths are `selfImprove` (agent-eval) and `runStrategyEvolution` (#2's subpath) | -| 13b | [`improve/`](./improve/) | loops suite | `improve(profile, findings, opts)` — the one pluggable RSI verb (held-out-gated surface optimization), offline with a scripted generator | -| 13c | [`intelligence-recommend/`](./intelligence-recommend/) | loops suite | The intelligence loop end to end, offline: `recordTrace` → derived `AnalystFinding`s → `improve()` → a gated candidate (the first example connecting the two halves) | -| 14 | [`agents-of-all-shapes/`](./agents-of-all-shapes/) | infra | Any framework's traces → one OTel GenAI contract → in-process `InsightReport` (the only example with a CI test) | +| # | Example | Use this when… | +|---|---|---| +| 16 | [`strategy-evolution/`](./strategy-evolution/) | You want the full policy-search + holdout gate: author candidates from losses, promote a champion only if a paired-bootstrap CI says it isn't luck (needs `TANGLE_API_KEY`). | +| 17 | [`improve/`](./improve/) | You want the one supported RSI verb: `improve(profile, findings, opts)` — optimize one profile surface, ship only if it clears the held-out gate. Offline. | +| 18 | [`self-improving-loop/`](./self-improving-loop/) | You want the unrolled internals of #17: v0 → judge → analyst → mutation → v1 → gate, with the "which substrate owns each phase" map. Offline. | +| 19 | [`intelligence-recommend/`](./intelligence-recommend/) | You want the intelligence loop offline: trace → findings → `improve()` → gated candidate. 
| +| 20 | [`intelligence-drop-in/`](./intelligence-drop-in/) | You want to wrap any agent with `withTangleIntelligence` and ship one trace per call (best-effort; off = passthrough). | +| 21 | [`agents-of-all-shapes/`](./agents-of-all-shapes/) | You want proof that any framework's traces converge on one OTel contract → one `InsightReport` (the CI-tested example). | +| 22 | [`product-eval/`](./product-eval/) | You want user-sim product evals: a persona over a multi-round conversation via `runPersonaConversation`, then score the transcript (`maxTurns` is a ceiling, not a target). Needs `TANGLE_API_KEY`; offline via a `backendFor` override. | ## Conventions -- Examples are synthetic unless noted. `strategy-suite`, `strategy-evolution`, and `product-eval` need `TANGLE_API_KEY` (`improve` and `intelligence-recommend` run fully offline); `stream-backends`' OpenAI section needs `OPENAI_API_KEY` (the rest of it runs offline); `mcp-delegation` needs `pnpm build` first so the local MCP bin exists; `researcher-loop` needs the optional `@tangle-network/agent-knowledge` peer. -- Where domain types are needed (`SandboxBox`, evidence stores), the example defines them inline — comments call out which parts are *yours* to provide vs *the runtime's* contract. -- No example creates its own throwaway `package.json` — they run from this repo's tsx so changes to the runtime are picked up immediately. +- Examples are synthetic unless noted. `strategy-suite`, `strategy-evolution`, `product-eval`, + `supervise`, and `delegate` need `TANGLE_API_KEY`; `stream-backends`' OpenAI section needs + `OPENAI_API_KEY` (the rest runs offline); `mcp-delegation` needs `pnpm build` first so the local + MCP bin exists; `researcher-loop` needs the optional `@tangle-network/agent-knowledge` peer. + Everything else runs fully offline. 
+- Where domain types are needed (`SandboxBox`, evidence stores), the example defines them inline — + comments call out which parts are *yours* to provide vs *the runtime's* contract. +- No example creates its own throwaway `package.json` — they run from this repo's tsx so changes to + the runtime are picked up immediately. ## Run -From the agent-runtime repo root, in the suggested learning order: +From the agent-runtime repo root, in the learning order above: ```bash -# The three cores +# Tier 0 — the three cores pnpm tsx examples/chat-handler/chat-handler.ts TANGLE_API_KEY=... pnpm tsx examples/strategy-suite/strategy-suite.ts pnpm tsx examples/recursive-supervisor/recursive-supervisor.ts -# Production runtime, deeper +# Tier 1 — driver loop & supervisor (the heart) +pnpm tsx examples/driver-loop/driver-loop.ts # SEE THE FOLD (offline) +TANGLE_API_KEY=... pnpm tsx examples/supervise/supervise.ts # the one-call supervisor +WORKER_MODEL=opencode/anthropic/claude-sonnet-4-5 \ + pnpm tsx examples/supervisor-loop/run-bridge.ts # same supervisor, local cli-bridge backend +TANGLE_API_KEY=... 
pnpm tsx examples/delegate/e2e-delegate-real.ts # delegate(intent), e2e + +# Tier 2 — the runLoop kernel +pnpm tsx examples/researcher-loop/researcher-loop.ts +pnpm dlx tsx examples/ui-audit/ui-audit.ts /tmp/ui-audit-demo https://example.com + +# Tier 3 — production runtime, deeper pnpm tsx examples/knowledge-gating/knowledge-gating.ts -pnpm tsx examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts pnpm tsx examples/runtime-run/runtime-run.ts pnpm tsx examples/stream-backends/stream-backends.ts +pnpm tsx examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts -# Delegation +# Tier 4 — delegation over MCP pnpm build # mcp-delegation needs dist/mcp/bin.js pnpm tsx examples/mcp-delegation/mcp-delegation.ts pnpm tsx examples/fleet-delegation/fleet-delegation.ts -# Supervisor core, deeper — one agent drives N workers (bridge = local cli-bridge path) -TANGLE_API_KEY=... pnpm tsx examples/supervise/supervise.ts # router brain + router-tools workers (the one-call entry) -WORKER_MODEL=opencode/anthropic/claude-sonnet-4-5 pnpm tsx examples/supervisor-loop/run-bridge.ts # local harness CLIs via ~/code/cli-bridge - -# runLoop kernel -pnpm tsx examples/coder-loop/coder-loop.ts -pnpm tsx examples/researcher-loop/researcher-loop.ts -pnpm dlx tsx examples/ui-audit/ui-audit.ts /tmp/ui-audit-demo https://example.com - -# The loops suite, deeper — search + evals -TANGLE_API_KEY=... pnpm tsx examples/strategy-evolution/strategy-evolution.ts # policy search → holdout gate -TANGLE_API_KEY=... pnpm tsx examples/product-eval/product-eval.ts # user-sim product evals (evalPersona) - -# Self-improvement + observability +# Tier 5 — self-improvement & intelligence +TANGLE_API_KEY=... 
pnpm tsx examples/strategy-evolution/strategy-evolution.ts +pnpm tsx examples/improve/improve.ts pnpm tsx examples/self-improving-loop/self-improving-loop.ts -pnpm tsx examples/improve/improve.ts # improve() — the RSI verb (offline) -pnpm tsx examples/intelligence-recommend/intelligence-recommend.ts # traces → findings → improve() (offline) +pnpm tsx examples/intelligence-recommend/intelligence-recommend.ts +pnpm tsx examples/intelligence-drop-in/intelligence-drop-in.ts pnpm tsx examples/agents-of-all-shapes/run.ts +TANGLE_API_KEY=... pnpm tsx examples/product-eval/product-eval.ts ``` ## Tracing -The kernels emit `loop.*` trace events as they run; with `OTEL_EXPORTER_OTLP_ENDPOINT` set they export as OTel GenAI spans (see the root README § Tracing). `agents-of-all-shapes/` (#14) shows the full traces → insights pipe; the `agent-stack-adoption` skill documents the end-to-end production ingestion pipeline. +The kernels emit `loop.*` trace events as they run; with `OTEL_EXPORTER_OTLP_ENDPOINT` set they +export as OTel GenAI spans (see the root README § Tracing). `agents-of-all-shapes/` (#21) shows the +full traces → insights pipe; the `agent-stack-adoption` skill documents the end-to-end production +ingestion pipeline. diff --git a/examples/coder-loop/README.md b/examples/coder-loop/README.md deleted file mode 100644 index ba3f278c..00000000 --- a/examples/coder-loop/README.md +++ /dev/null @@ -1,109 +0,0 @@ -# coder-loop - -`coderProfile()` + `runLoop()` + an inline fanout `Driver` — the smallest -end-to-end coder loop. Two parallel iterations attempt the same goal; the -validator scores test + typecheck + diff size; the kernel picks the -highest-scoring valid winner. - -`runLoop` is the round-synchronous kernel: `driver.plan()` → N tasks → one -sandbox per iteration → `output.parse` → `validator.validate` → -`driver.decide`. 
For new recursive/multi-level work, prefer the reactive -`Scope`/`Supervisor` core and the personify combinators (`fanout` does this -example's topology generically) — see -[`examples/recursive-supervisor/`](../recursive-supervisor/). - -## Topology - -The driver is ~5 lines, hand-written in `coder-loop.ts`: a single-round -fanout whose `plan()` returns two copies of the task only when `history` is -empty (round 0), then `[]` forever after — it spawns N, scores, and picks; it -never refines. Each of the N tasks becomes its own iteration, and every -iteration runs the same `output.parse` → `validator.validate` pipeline -independently before the driver votes. - -```mermaid -flowchart TD - task["CoderTask\ngoal: add util.ts add(a,b)"] --> plan0 - - subgraph round0["round 0 — driver.plan(task, history=[])"] - plan0["inline fanout driver\nreturns [task, task]"] - end - - plan0 --> reserve["kernel reserves 2 iteration slots\nrunBatch dispatches in parallel\n(bounded by maxConcurrency)"] - - reserve --> wA - reserve --> wB - - subgraph A["iteration 0 — worker A"] - direction TB - wA["sandboxClient.create()\n→ box.streamPrompt()"] --> evA["events:\nllm_call (costUsd 0.0036)\nresult { branch util-add-A }"] - evA --> parseA["output.parse → CoderOutput\ntyped arrow fn\nexport const add = (a:number,b:number):number"] - parseA --> valA["validator.validate\ntests pass · typecheck PASS\ndiff 2 ≤ 50 · no forbidden paths"] - valA --> verA["DefaultVerdict\nvalid = true · score ≈ 0.992"] - end - - subgraph B["iteration 1 — worker B"] - direction TB - wB["sandboxClient.create()\n→ box.streamPrompt()"] --> evB["events:\nllm_call (costUsd 0.0036)\nresult { branch util-add-B }"] - evB --> parseB["output.parse → CoderOutput\nuntyped params\nexport function add(a, b)"] - parseB --> valB["validator.validate\ntests pass · typecheck FAIL (TS7006)\ndiff 3 ≤ 50"] - valB --> verB["DefaultVerdict\nvalid = false · rejected"] - end - - verA --> plan1 - verB --> plan1 - - subgraph round1["round 1 — 
driver.plan(task, history=[2 done])"] - plan1["returns []\nmoveKind = stop (no refine)"] - end - - plan1 --> decide["driver.decide(history)\ndefaultSelector: filter valid,\nsort by verdict.score desc,\ntie-break iterationIndex asc"] - - decide --> winner["decision = pick-winner\nwinner = iteration 0 (A)"] - verB -.->|invalid, dropped| decide - - verA -.->|costUsd 0.0036| cost - verB -.->|costUsd 0.0036| cost - cost["result.costUsd = 0.0072\n(sum of per-iteration costUsd)"] - winner --> cost - - classDef win fill:#1b5e20,stroke:#2e7d32,color:#fff - classDef lose fill:#5d1a1a,stroke:#b71c1c,color:#fff - class verA,winner win - class verB lose -``` - -## Run - -```bash -pnpm tsx examples/coder-loop/coder-loop.ts -``` - -## What it shows - -- How `coderProfile({ task, harness })` bundles `profile`, `taskToPrompt`, - `output` (event-stream → `CoderOutput`), `validator` (test + typecheck + - diff cap + forbidden-path enforcement), and `agentRunSpec` together. -- How a hand-written `Driver` (`plan` + `decide`) makes the kernel plan N - parallel iterations and pick the winning output — the whole `Driver` - contract is two functions. -- How the synthetic `sandboxClient` mirrors the production - `@tangle-network/sandbox` `Sandbox` surface — swap it for `new Sandbox(...)` - when you wire to production. -- How `result.winner` carries the typed `CoderOutput`, the verdict, and the - iteration index — everything you need to merge the patch in CI. - -## Wire to production - -Swap the synthetic `sandboxClient` for: - -```ts -import { Sandbox } from '@tangle-network/sandbox' - -const sandboxClient = new Sandbox({ apiKey: process.env.TANGLE_API_KEY! }) -``` - -Then `runLoop` creates a fresh sandbox per iteration via `sandboxClient.create()` -and streams the prompt through `box.streamPrompt(taskToPrompt(task))`. Each -iteration's events feed the same `output.parse` → `validator.validate` -pipeline. 
diff --git a/examples/coder-loop/coder-loop.ts b/examples/coder-loop/coder-loop.ts deleted file mode 100644 index 9cbacb52..00000000 --- a/examples/coder-loop/coder-loop.ts +++ /dev/null @@ -1,72 +0,0 @@ -// worktreeLoopRunner — the smallest end-to-end coder loop on the generic recursive path: -// author one AgentProfile per harness, fan them out over worktree-CLI leaves, gate each on -// patchDelivered, and pick the winning patch with the shared valid-only selector. See README.md. - -import { worktreeLoopRunner } from '@tangle-network/agent-runtime' -import type { AgentProfile } from '@tangle-network/sandbox' - -const profile = (name: string): AgentProfile => ({ - name, - prompt: { systemPrompt: `You are ${name}. Deliver a minimal, correct patch.` }, -}) - -// ── Offline test seams ─────────────────────────────────────────────────── -// A fake git that hands every worktree the same one-line patch, a no-op harness -// runner, and a passing check runner. Production callers leave these unset (the -// runner drives the real claude/codex/opencode CLIs on real worktrees). 
-const patch = [ - 'diff --git a/util.ts b/util.ts', - '--- a/util.ts', - '+++ b/util.ts', - '+export const add = (a: number, b: number): number => a + b', -].join('\n') - -async function main(): Promise<void> { - const runner = worktreeLoopRunner({ - repoRoot: '/tmp/coder-loop-example', - taskPrompt: 'add util.ts that exports add(a, b)', - budget: { maxIterations: 50, maxTokens: 500_000 }, - harnesses: [ - { name: 'claude', profile: profile('claude'), harness: 'claude' }, - { name: 'opencode', profile: profile('opencode'), harness: 'opencode' }, - ], - testCmd: 'node -e \'require("./util").add(1,2)===3 || process.exit(1)\'', - typecheckCmd: 'pnpm typecheck', - require: ['tests', 'typecheck'], - maxDiffLines: 50, - forbiddenPaths: ['secrets/', 'node_modules/'], - runGit: (args: readonly string[]) => { - if (args[0] === 'diff' && args.includes('--shortstat')) { - return { - stdout: ' 1 file changed, 1 insertion(+), 0 deletions(-)\n', - stderr: '', - exitCode: 0, - } - } - if (args[0] === 'diff') return { stdout: patch, stderr: '', exitCode: 0 } - if (args[0] === 'rev-parse') return { stdout: 'base\n', stderr: '', exitCode: 0 } - return { stdout: '', stderr: '', exitCode: 0 } - }, - runHarness: async () => ({ - exitCode: 0, - stdout: 'done', - stderr: '', - killedBySignal: null, - durationMs: 1, - timedOut: false, - }), - runCommand: async () => ({ exitCode: 0, output: 'green' }), - }) - - const winner = await runner(new AbortController().signal) - console.log(`winning branch: ${winner.branch}`) - console.log(` diff (${winner.stats.insertions} insertions):`) - for (const line of winner.patch.split('\n')) console.log(` ${line}`) - console.log(` tests passed: ${winner.checks?.tests?.passed ?? '(not run)'}`) - console.log(` typecheck passed: ${winner.checks?.typecheck?.passed ?? 
'(not run)'}`) -} - -main().catch((err) => { - console.error(err) - process.exit(1) -}) diff --git a/examples/delegate/e2e-delegate-real.ts b/examples/delegate/e2e-delegate-real.ts index 2608352c..60f9823c 100644 --- a/examples/delegate/e2e-delegate-real.ts +++ b/examples/delegate/e2e-delegate-real.ts @@ -11,7 +11,7 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { dirname, join, resolve } from 'node:path' -import { delegate, type ExecutorConfig } from '../../dist/loops.js' +import { delegate, type ExecutorConfig } from '@tangle-network/agent-runtime/loops' const routerBaseUrl = process.env.TANGLE_ROUTER_URL ?? 'https://router.tangle.tools/v1' const routerKey = process.env.TANGLE_API_KEY diff --git a/examples/driver-loop/README.md b/examples/driver-loop/README.md new file mode 100644 index 00000000..cb38b5ac --- /dev/null +++ b/examples/driver-loop/README.md @@ -0,0 +1,72 @@ +# driver-loop + +**See the fold.** This is the single most important example in the set: a driver that +*reads the last worker's output and writes the next instruction from it*. That read-then-rewrite +move — "the fold" — is what every supervisor in this repo is built on. Once you've seen it here, +`supervise()`, the coordination MCP, and the self-improvement loop all read as variations of it. + +Runs fully offline (a scripted worker, no credentials): + +```bash +pnpm tsx examples/driver-loop/driver-loop.ts +``` + +## Vocabulary + +These words are used across every example and defined here. + +| Term | Meaning | +|---|---| +| **round** | One full driver cycle: `plan → run workers → decide`. The `runLoop` kernel runs exactly this, once per round. | +| **shot** | One independent worker attempt/sample. A single round can run many shots (a fanout). | +| **multishot** | N shots played in parallel. | +| **sample** | A strategy: take the best of N shots (breadth). 
| +| **refine** | A strategy: iterate-with-critique *across rounds* (depth) — what SECTION 1 of this example does. | + +## What the example shows + +**SECTION 1 — ROUNDS (refine), the centerpiece.** A multi-round driver: + +- **Round 0** — `driver.plan(task, history=[])`: no history yet, so it runs the worker once. The + worker drafts a release note but forgets a required word, so the validator **rejects** it. +- **Round 1** — `driver.plan(task, history=[1 rejected])`: the driver READS the rejected draft + and its verdict out of `history`, then COMPOSES a corrective prompt *from that output* ("your + draft was X, it was rejected because Y — rewrite it to mention Z"). The worker obeys the new + prompt and the validator **passes**. + +The two load-bearing lines in `driver-loop.ts` are commented `THE FOLD, PART 1: INGEST` (where it +reads `history[history.length-1].output`) and `THE FOLD, PART 2: GENERATE` (where it builds the +next prompt). In production a router LLM does that composition — it reads the folded worker output +from its tool-result messages and writes the next spawn's prompt. Here it's plain code so the seam +is visible. + +```mermaid +flowchart TD + task["NoteTask\nprompt: draft a release note"] --> plan0 + subgraph r0["ROUND 0 — plan(task, history=[])"] + plan0["driver runs the worker once"] + end + plan0 --> w0["worker → 'Shipped one-click restore for failed deploys.'"] + w0 --> v0{"validator: mentions 'rollback'?"} + v0 -->|no — REJECT| fold["THE FOLD\ndriver reads the rejected draft\n+ builds a corrective prompt from it"] + subgraph r1["ROUND 1 — plan(task, history=[1 rejected])"] + fold + end + fold --> w1["worker → '…with an instant rollback path…'"] + w1 --> v1{"validator: mentions 'rollback'?"} + v1 -->|yes — PASS| done["decide → pick-winner"] +``` + +**SECTION 2 — SHOTS (multishot), the contrast.** Three independent attempts at the same task, +in parallel, with **no fold between them**. 
This is the *other* axis: a round refines depth-wise +(each round improves on the last); a shot explores breadth-wise (many tries at once). Seeing them +side by side is the cleanest way to internalize round vs shot. + +## Where this goes next + +- `examples/supervise/` — the one-call `supervise(profile, goal)` where a router LLM does the fold + for you. +- `examples/supervisor-loop/` — the same supervisor over a real worker backend (sandbox box / + local cli-bridge), worker backend as the only knob. +- `examples/researcher-loop/` and `examples/ui-audit/` — `runLoop` drivers that are *single-round* + and *content-blind* on purpose (they never fold); read those to see the contrast with this one. diff --git a/examples/driver-loop/driver-loop.ts b/examples/driver-loop/driver-loop.ts new file mode 100644 index 00000000..6044a3bd --- /dev/null +++ b/examples/driver-loop/driver-loop.ts @@ -0,0 +1,279 @@ +/** + * driver-loop — SEE THE FOLD. + * + * This is the one concept that makes the whole supervisor/driver story click: a driver + * does not just count iterations. It READS the last worker's actual output and WRITES the + * next instruction FROM that output. That read-then-rewrite is "the fold". Everything else + * in this repo — supervise(), the coordination MCP, the self-improvement loop — is built on + * top of this single move. + * + * ── Vocabulary (used everywhere, defined here) ────────────────────────────────────────── + * • round — one full driver cycle: plan → run workers → decide. The `runLoop` kernel + * calls plan(), runs the planned workers, then calls decide(), once per round. + * • shot — one independent worker attempt/sample. A round can run many shots (a fanout). + * • multishot — N shots played in parallel (see SECTION 2 below). + * • sample — a strategy: take the best of N shots (breadth). + * • refine — a strategy: iterate-with-critique ACROSS rounds (depth) — this file's SECTION 1. + * + * SECTION 1 (the centerpiece) is a multi-ROUND refine driver. 
Round 0 asks the worker to draft + * a release note; the validator rejects it for missing a required word; the driver READS that + * rejected draft and BUILDS a corrective prompt from it; round 1 re-runs with that prompt and + * passes. SECTION 2 contrasts it with a multi-SHOT run so the two axes sit side by side. + * + * Fully offline — the worker is a scripted client keyed on the prompt, so it runs with zero + * credentials (the same offline pattern self-improving-loop uses). + * + * Run: pnpm tsx examples/driver-loop/driver-loop.ts + */ + +import { + type MultishotPersona, + type MultishotShape, + runMultishot, +} from '@tangle-network/agent-eval/multishot' +import { + type DefaultVerdict, + type Driver, + type OutputAdapter, + runLoop, + type Validator, +} from '@tangle-network/agent-runtime/loops' +import type { AgentProfile, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox' + +// ── The task + what "good" means ──────────────────────────────────────────────────────── +// The agent must draft a one-line release note that mentions the word "rollback". A real +// product would validate something richer; the required word keeps the example deterministic. +interface NoteTask { + feature: string + /** The next instruction the worker should run. The DRIVER rewrites this between rounds. */ + prompt: string +} +interface NoteOutput { + note: string +} +const requiredWord = 'rollback' + +// ── The worker (scripted, offline) ────────────────────────────────────────────────────── +// A worker is just something that takes a prompt and streams back events. Here we fake it: +// the FIRST prompt produces a draft that forgets the required word (so it will be rejected); +// any prompt that mentions the required word produces a corrected draft. That keyed behavior +// is what lets the example PROVE the fold worked: round 1 only passes because the driver put +// the right correction into the prompt. 
+function scriptedWorkerClient(): { create(): Promise } { + return { + async create(): Promise { + return { + id: `worker-${Math.random().toString(36).slice(2, 8)}`, + async *streamPrompt(prompt: string): AsyncIterable { + yield { + type: 'llm_call', + data: { model: 'scripted', tokensIn: 200, tokensOut: 40, costUsd: 0.0006 }, + } + // The worker "obeys" the prompt: if the driver's corrective prompt told it to + // mention the required word, it does; otherwise it ships the naive first draft. + const note = prompt.toLowerCase().includes(requiredWord) + ? 'Shipped one-click restore with an instant rollback path if a deploy goes bad.' + : 'Shipped one-click restore for failed deploys.' + yield { type: 'result', data: { result: { note } satisfies NoteOutput } } + }, + } as unknown as SandboxInstance + }, + } +} + +// ── The output adapter: raw event stream → typed output ───────────────────────────────── +const output: OutputAdapter = { + parse(events: SandboxEvent[]): NoteOutput { + for (const ev of events) { + if (ev.type === 'result') { + const r = (ev as { data?: { result?: unknown } }).data?.result + if (r && typeof r === 'object' && 'note' in r) return r as NoteOutput + } + } + return { note: '' } + }, +} + +// ── The validator: the pass/fail check the driver reads to decide whether to refine ────── +const validator: Validator = { + validate(out: NoteOutput): Promise { + const valid = out.note.toLowerCase().includes(requiredWord) + return Promise.resolve({ + valid, + score: valid ? 1 : 0, + notes: valid ? 'mentions rollback' : `missing required word "${requiredWord}"`, + }) + }, +} + +// ── THE DRIVER — this is the example ──────────────────────────────────────────────────── +// A driver is two functions: plan() (what to run this round) and decide() (are we done?). +// The fold lives inside plan(): on round > 0 it READS history (the last worker's real output +// + its verdict) and COMPOSES the next prompt FROM that output. 
+// +// Decision values: the kernel STOPS the loop when decide() returns a TERMINAL value +// ('stop' | 'pick-winner' | 'fail' | 'done'). Any other string is non-terminal → the loop +// runs another round. That's the footgun for a refine driver: if decide() returned 'fail' +// after a failing round 0, the loop would stop BEFORE it ever got to refine. So we return the +// non-terminal 'refine' to keep going, and only the terminal 'pick-winner'/'fail' when truly done. +type NoteDecision = 'refine' | 'pick-winner' | 'fail' + +function refineDriver(maxRounds: number): Driver { + return { + name: 'refine', + async plan(task, history) { + // ROUND 0 — no history yet, so just run the initial task once. + if (history.length === 0) return [task] + + // We already passed? Stop refining (return [] → no more workers this round). + const last = history[history.length - 1] + if (last?.verdict?.valid) return [] + + // Round cap: stop even if still failing. + if (history.length >= maxRounds) return [] + + // ── THE FOLD, PART 1: INGEST the last worker's actual output ──────────────────────── + // `history[history.length - 1].output` is the real answer the previous worker produced; + // `.verdict` is how it scored. This read is what separates a driver from a counter. + const draft = last?.output?.note ?? '(empty draft)' + const why = last?.verdict?.notes ?? 'failed validation' + + // ── THE FOLD, PART 2: GENERATE the next prompt FROM that output ────────────────────── + // We build the NEXT instruction out of what we just read. In a real supervisor a router + // LLM does this composition (it reads the folded worker output via its tool-result + // messages and writes the next spawn's prompt); here we do it in plain code so the seam + // is visible. The corrective prompt deliberately names the required word so the scripted + // worker can obey it — proving the loop's behavior changed BECAUSE of the fold. + const correctedPrompt = + `Your previous draft was: "${draft}". 
It was rejected because ${why}. ` + + `Rewrite the release note for "${task.feature}" so it explicitly mentions the ` + + `${requiredWord} path. Keep it to one line.` + + return [{ ...task, prompt: correctedPrompt }] + }, + + // decide() runs after each round, AND once more when plan() returns [] (the finalize pass). + // • a valid winner exists → 'pick-winner' (terminal: we're done, ship it) + // • no winner but rounds remain → 'refine' (NON-terminal: loop runs plan() again) + // • no winner and out of rounds → 'fail' (terminal: give up) + decide(history): NoteDecision { + if (history.some((it) => it.verdict?.valid)) return 'pick-winner' + return history.length < maxRounds ? 'refine' : 'fail' + }, + } +} + +// ── SECTION 1: run the refine (multi-round) driver ────────────────────────────────────── +async function runRefine(): Promise { + console.log('── SECTION 1 · ROUNDS (refine) — driver reads worker output, rewrites the prompt') + + const task: NoteTask = { + feature: 'one-click restore', + prompt: 'Write a one-line release note for the one-click restore feature.', + } + + const result = await runLoop({ + driver: refineDriver(3), + agentRun: { + profile: { name: 'note-writer' } as AgentProfile, + // Each round's task carries the prompt the driver authored; this is how the rewritten + // instruction actually reaches the worker. + taskToPrompt: (t) => t.prompt, + }, + output, + validator, + task, + ctx: { sandboxClient: scriptedWorkerClient() }, + maxIterations: 5, + }) + + // One iteration == one round here (the driver runs a single worker per round). + for (const it of result.iterations) { + const verdict = it.verdict?.valid ? 'PASS' : 'reject' + console.log(` ROUND ${it.index}: [${verdict}] note = "${it.output?.note ?? 
''}"`) + if (!it.verdict?.valid && it.index < result.iterations.length - 1) { + console.log(' └─ driver folds this rejected output into round', it.index + 1) + } + } + console.log(` decision: ${result.decision}`) + if (result.winner) console.log(` winner: round ${result.winner.iterationIndex}`) + console.log() +} + +// ── SECTION 2: contrast — SHOTS (multishot), the OTHER axis ────────────────────────────── +// A round refines DEPTH-wise (each round improves on the last). A shot explores BREADTH-wise: +// N independent attempts at the SAME task, in parallel, no fold between them. runMultishot is +// the substrate primitive for that. We run it with a mocked router so it stays offline. +interface SimplePersona extends MultishotPersona { + id: string +} +async function runShots(): Promise { + console.log('── SECTION 2 · SHOTS (multishot) — N independent attempts, no fold between them') + + const restore = installMockRouter([ + { text: 'Attempt A: one-click restore with a rollback path.' }, + { text: 'Attempt B: one-click restore, instant rollback if a deploy fails.' }, + { text: 'Attempt C: one-click restore; rollback included.' }, + ]) + process.env.TANGLE_API_KEY ??= 'test-key' + try { + const profile: AgentProfile = { + name: 'note-writer', + prompt: { systemPrompt: 'Write a one-line release note that mentions rollback.' }, + } + const shape: MultishotShape = { + buildOpener: () => 'Write the release note.', + buildDriverSystemPrompt: () => 'You are drafting a release note.', + } + // Three personas == three shots; they run independently. There is no round-to-round fold. + for (const id of ['shot-0', 'shot-1', 'shot-2']) { + const res = await runMultishot({ + profile, + persona: { id } as SimplePersona, + shape, + maxTurns: 1, + }) + // Grab the worker's reply: the last non-user, non-tool message in the transcript. 
+ const reply = [...res.transcript] + .reverse() + .find((m) => m.role !== 'user' && m.role !== 'tool') + console.log(` ${id} (parallel): "${reply?.content ?? ''}"`) + } + } finally { + restore() + } + console.log() + console.log(' ROUND vs SHOT: a round folds the last output into the next prompt (depth);') + console.log(' a shot is one independent attempt; multishot plays N shots at once (breadth).') +} + +// Minimal offline router stub (same pattern as self-improving-loop) so SECTION 2 needs no creds. +function installMockRouter(replies: Array<{ text: string }>): () => void { + const original = global.fetch + let i = 0 + global.fetch = (async () => { + const r = replies[i++ % replies.length] + return { + ok: true, + status: 200, + json: async () => ({ + choices: [{ message: { content: r?.text ?? '' } }], + usage: { prompt_tokens: 80, completion_tokens: 20 }, + }), + text: async () => 'ok', + } as Response + }) as typeof fetch + return () => { + global.fetch = original + } +} + +async function main(): Promise { + await runRefine() + await runShots() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/examples/fleet-delegation/README.md b/examples/fleet-delegation/README.md index 125345e7..eccdefc6 100644 --- a/examples/fleet-delegation/README.md +++ b/examples/fleet-delegation/README.md @@ -77,9 +77,9 @@ flowchart TD end ``` -- **Sibling** (default): each `delegate_code` / `delegate_research` spawns - a fresh sandbox via `sandboxClient.create()`. Worker output flows back - through the MCP response — there is no shared filesystem. +- **Sibling** (default): each `delegate` call spawns a fresh sandbox via + `sandboxClient.create()`. Worker output flows back through the MCP + response — there is no shared filesystem. - **Fleet** (set `TANGLE_FLEET_ID`): each delegation lands on an existing machine in the parent fleet. The fleet's shared-workspace policy means the worker sees the caller's filesystem and any diff lands in-place. 
diff --git a/examples/mcp-delegation/README.md b/examples/mcp-delegation/README.md index 5d24af73..0b9c16b5 100644 --- a/examples/mcp-delegation/README.md +++ b/examples/mcp-delegation/README.md @@ -1,7 +1,7 @@ # mcp-delegation How a product mounts the `agent-runtime-mcp` server into its `AgentProfile`, -plus a tiny stdio JSON-RPC client that proves the server exposes all five +plus a tiny stdio JSON-RPC client that proves the server exposes the delegation tools. ## Run @@ -14,22 +14,26 @@ pnpm tsx examples/mcp-delegation/mcp-delegation.ts The first block prints the `mcp['agent-runtime-delegation']` entry a product passes to `sandboxClient.create({ backend: { profile } })`. The second block spawns the locally-built `dist/mcp/bin.js`, calls -`tools/list` over stdio JSON-RPC, and asserts the five canonical tools -are present. +`tools/list` over stdio JSON-RPC, and asserts the always-on tools are +present. ## What it shows - The literal `AgentProfileMcpServer` shape consumers paste into their own product's profile composer. - The bin's expected env: `TANGLE_API_KEY` for live delegations, + `MCP_ENABLE_DELEGATE=1` to opt the generic `delegate` verb in, and `AGENT_RUNTIME_MCP_ALLOW_NO_KEY=1` for the diagnostic mode the smoke leg uses when no key is set. -- The five canonical tools every consumer expects: - - `delegate_code` — async coder dispatch - - `delegate_research` — async researcher dispatch - - `delegate_feedback` — append-only rating store - - `delegation_status` — poll for `pending` / `running` / `completed` - - `delegation_history` — read past delegations newest-first +- The delegation tools: + - `delegate` — the ONE generic verb: a supervisor that authors + drives its + own worker and returns the delivered output with its real spend. Replaces + the old hardcoded `delegate_code` / `delegate_research`. Registers ONLY when + `MCP_ENABLE_DELEGATE=1` AND a real sandbox key resolves. 
+ - `delegate_feedback` — append-only rating store (always on) + - `delegation_status` — poll for `pending` / `running` / `completed` (always on) + - `delegation_history` — read past delegations newest-first (always on) + - `delegate_ui_audit` — served only when a UI-audit runner is wired in ## Production wiring @@ -46,6 +50,7 @@ const profile: AgentProfile = { env: { TANGLE_API_KEY: process.env.TANGLE_API_KEY!, SANDBOX_BASE_URL: 'https://sandbox.tangle.tools', + MCP_ENABLE_DELEGATE: '1', // opt the generic `delegate` verb in (off by default) }, enabled: true, }, @@ -54,8 +59,10 @@ const profile: AgentProfile = { ``` Pass `profile` to `sandboxClient.create({ backend: { profile } })`. The -sandbox-side agent harness now sees the five delegation tools mid-turn, -and can fan work out to coders/researchers without blocking the chat. +sandbox-side agent harness now sees the delegation tools mid-turn, and can +fan work out via the generic `delegate` verb without blocking the chat. +Omit `MCP_ENABLE_DELEGATE` and only the always-on trio +(`delegate_feedback` / `delegation_status` / `delegation_history`) is exposed. See [`fleet-delegation`](../fleet-delegation/) for the multi-machine variant where delegations dispatch into a shared workspace instead of diff --git a/examples/product-eval/README.md b/examples/product-eval/README.md index 2ad594de..7c228ed0 100644 --- a/examples/product-eval/README.md +++ b/examples/product-eval/README.md @@ -1,12 +1,13 @@ # product-eval -User-sim product evals in one call — `evalPersona`, plus the `runPersonaDispatch` → matrix path. +User-sim product evals — `runPersonaConversation` (the persona loop) + the +`runPersonaDispatch` → matrix path. -A product eval runs the **agent under test** against a **persona** (a simulated user) over a -multi-round conversation, then scores the transcript. 
`evalPersona(worker, persona, opts)` is the -one-call entry — you author a worker `AgentProfile` and a persona, and it defaults the two seams -`runPersonaConversation` otherwise makes you hand-wire: the backend (from `opts.{apiKey,baseUrl,model}`) -and the system prompt (`p.prompt?.systemPrompt`). +A product eval runs the **agent under test** against a **persona** (a simulated +user) over a multi-round conversation, then scores the transcript. +`runPersonaConversation` is the loop runner: you author a worker `AgentProfile` +and a persona, and supply two seams — `backendFor` (turn a profile into a +runnable backend) and `systemPromptOf` (render its system prompt). Three cells, smallest to largest: @@ -27,6 +28,6 @@ Optional env: `WORKER_MODEL` (the agent under test, default `gpt-4o-mini`), `ROU ## Offline -`evalPersona` and `runPersonaDispatch` both take a `backendFor` override — pass a fake backend and the -whole loop runs with no credentials and no network. See `src/conversation/eval-persona.test.ts` for -the `$0` offline pattern (it is part of `pnpm test`). +Both `runPersonaConversation` and `runPersonaDispatch` take a `backendFor` seam — pass a fake +backend and the whole loop runs with no credentials and no network. See +`src/conversation/run-persona.test.ts` for the `$0` offline pattern (it is part of `pnpm test`). diff --git a/examples/researcher-loop/README.md b/examples/researcher-loop/README.md index c5d02d7d..96193a07 100644 --- a/examples/researcher-loop/README.md +++ b/examples/researcher-loop/README.md @@ -1,17 +1,19 @@ # researcher-loop `researcherProfile()` (from `@tangle-network/agent-knowledge/profiles`) + -`runLoop()` + an inline fanout `Driver` — the researcher-flavoured -counterpart to [`coder-loop`](../coder-loop). Two parallel researcher -iterations attempt the same question; the validator scores citation density + -namespace scoping + per-item provenance; the kernel picks the -highest-scoring valid winner. 
+`runLoop()` + an inline fanout `Driver` — the primary, smallest example of the +`runLoop` kernel. Two parallel researcher attempts answer the same question; +the validator scores citation density + namespace scoping + per-item +provenance; the kernel picks the highest-scoring valid winner. -Same `runLoop` kernel and inline fanout driver as -[`coder-loop`](../coder-loop), only the profile differs. The load-bearing -branch below is candidate B: it leaks an item into `other-tenant`, so the -validator hard-fails the entire output and the kernel prunes it — leaving A -as the sole winner. +A **round** is one `plan → run workers → decide` cycle. This driver is +**single-round**: `plan()` returns two copies of the task on round 0, then `[]` +forever after — so it spawns two workers, scores both, and picks once. It never +reads a worker's output to write the next instruction. To see a driver that +*does* re-plan from worker output (the supervisor pattern), read +[`driver-loop/`](../driver-loop). The load-bearing branch below is candidate B: +it leaks an item into `other-tenant`, so the validator hard-fails the entire +output and the kernel prunes it — leaving A as the sole winner. ```mermaid flowchart TD diff --git a/examples/researcher-loop/researcher-loop.ts b/examples/researcher-loop/researcher-loop.ts index 36ebe00d..b46c1123 100644 --- a/examples/researcher-loop/researcher-loop.ts +++ b/examples/researcher-loop/researcher-loop.ts @@ -145,6 +145,11 @@ async function main(): Promise { const { output, validator, agentRunSpec } = researcherProfile({ task }) const driver: Driver = { name: 'fanout', + // A "round" = one plan → run workers → decide cycle. This driver is SINGLE-ROUND: + // it returns two copies of the task on round 0 (history empty) → two parallel + // workers (a "fanout"), then [] forever after → it spawns, scores, and picks ONCE. + // It never reads a worker's output to build the next prompt. 
For a driver that + // re-plans from worker output (the supervisor fold), see examples/driver-loop/. plan: async (task, history) => (history.length === 0 ? [task, task] : []), decide: (history) => (history.some((i) => i.verdict?.valid === true) ? 'pick-winner' : 'fail'), } diff --git a/examples/self-improving-loop/self-improving-loop.ts b/examples/self-improving-loop/self-improving-loop.ts index 59d7bf53..43384d7a 100644 --- a/examples/self-improving-loop/self-improving-loop.ts +++ b/examples/self-improving-loop/self-improving-loop.ts @@ -171,6 +171,11 @@ async function runVariant(profile: AgentProfile, scriptedReplies: ScriptedReply[ score: { composite: number } }> = [] for (const persona of PERSONAS) { + // A "shot" = one independent worker attempt/sample. `runMultishot` plays N shots + // in parallel and reports each; here each persona gets one shot (maxTurns:1 = one + // turn per shot). Contrast with a "round" (the driver-loop sense): a shot is ONE + // worker attempt; a round is one full plan → run workers → decide cycle that can + // span many shots. See examples/driver-loop/ for the round/shot vocabulary block. const result = await runMultishot({ profile, persona, shape, maxTurns: 1 }) const score = await runJudge(conversationJudge, { transcript: result.transcript, persona }) runs.push({ persona, result, score }) diff --git a/examples/supervise/supervise.ts b/examples/supervise/supervise.ts index 68f33e83..d393e96a 100644 --- a/examples/supervise/supervise.ts +++ b/examples/supervise/supervise.ts @@ -46,4 +46,4 @@ const result = await supervise( }, ) -console.log(result.kind === 'winner' ? '✓ delivered' : `✗ no winner (${result.kind})`) +console.log(result.kind === 'winner' ? 
'[OK] delivered' : `[--] no winner (${result.kind})`) diff --git a/examples/supervisor-loop/run-bridge.ts b/examples/supervisor-loop/run-bridge.ts index ec09f611..fcf5c3e0 100644 --- a/examples/supervisor-loop/run-bridge.ts +++ b/examples/supervisor-loop/run-bridge.ts @@ -94,8 +94,8 @@ async function main(): Promise { console.log( result.kind === 'winner' - ? `✅ delivered: ${JSON.stringify(result.out)}` - : `❌ no winner (${result.reason}, ${result.downCount} down)`, + ? `[OK] delivered: ${JSON.stringify(result.out)}` + : `[--] no winner (${result.reason}, ${result.downCount} down)`, ) } diff --git a/examples/supervisor-loop/run-sandbox.ts b/examples/supervisor-loop/run-sandbox.ts index c1c2eb37..aabcfa73 100644 --- a/examples/supervisor-loop/run-sandbox.ts +++ b/examples/supervisor-loop/run-sandbox.ts @@ -1,17 +1,24 @@ /** - * The sandbox path — each worker is a coding harness running in a real Tangle sandbox box. + * SANDBOXED SUPERVISOR — a supervisor that drives workers inside real Tangle sandbox boxes. + * + * The three-line shape: + * 1. the supervisor AUTHORS a worker `AgentProfile` (its standing instructions + harness), + * 2. each worker runs `runLoop` INSIDE a real box — `createExecutor({ backend: 'sandbox', + * harness, sandboxClient })` composes the kernel as a single-task leaf in a box running + * `harness` (opencode / claude-code / codex), + * 3. the supervisor reads each box's settled output and drives the next worker until the + * deliverable check passes. * * TANGLE_API_KEY=sk-... SANDBOX_BASE_URL=https://... pnpm tsx examples/supervisor-loop/run-sandbox.ts * * The supervisor is the canonical one-call `supervise()`; this runner supplies only the - * load-bearing sandbox seam — a real `SandboxClient` + `backend: 'sandbox'` (each worker leaf - * is `createExecutor({ backend: 'sandbox', harness, sandboxClient })`, which composes `runLoop` - * as a single-task leaf inside a box running `harness`). 
+ * load-bearing sandbox seam — a real `SandboxClient` + `backend: 'sandbox'`. The WORKER BACKEND + * is the only knob: swap `backend: 'sandbox'` for `'bridge'` and the IDENTICAL supervisor drives + * local harness CLIs instead (see run-bridge.ts). * * The driver brain defaults to the router (the box key is already in hand); set DRIVER=scripted - * for the offline brain. The IDENTICAL supervisor runs against local harness CLIs by swapping - * the one backend value to `bridge` — see run-bridge.ts. For a fully offline, no-creds wiring - * check, see tests/loops/coordination-driver.test.ts and tests/supervisor-loop-example.test.ts. + * for the offline brain. For a fully offline, no-creds wiring check, see + * tests/loops/coordination-driver.test.ts and tests/supervisor-loop-example.test.ts. */ import { @@ -75,8 +82,8 @@ async function main(): Promise { console.log( result.kind === 'winner' - ? `✅ delivered: ${JSON.stringify(result.out)}` - : `❌ no winner (${result.reason}, ${result.downCount} down)`, + ? 
`[OK] delivered: ${JSON.stringify(result.out)}` + : `[--] no winner (${result.reason}, ${result.downCount} down)`, ) } diff --git a/examples/supervisor-loop/run-supervisor-mcp.ts b/examples/supervisor-loop/run-supervisor-mcp.ts index be020878..0d698b3e 100644 --- a/examples/supervisor-loop/run-supervisor-mcp.ts +++ b/examples/supervisor-loop/run-supervisor-mcp.ts @@ -166,11 +166,11 @@ async function main(): Promise { console.log('\n── verdict ──') if (result.kind === 'winner') { console.log( - `✅ supervisor drove a worker via the coordination MCP to a CHECKED delivery on backend "${backend.backend}".`, + `[OK] supervisor drove a worker via the coordination MCP to a CHECKED delivery on backend "${backend.backend}".`, ) console.log(` winner output: ${JSON.stringify(result.out)}`) } else { - console.log(`❌ no delivery (result=${result.kind}) — see supervisor transcript above`) + console.log(`[--] no delivery (result=${result.kind}) — see supervisor transcript above`) process.exitCode = 1 } } diff --git a/examples/supervisor-loop/shared.ts b/examples/supervisor-loop/shared.ts index b85f26d8..a0e8e5e8 100644 --- a/examples/supervisor-loop/shared.ts +++ b/examples/supervisor-loop/shared.ts @@ -68,9 +68,12 @@ export function scriptedSupervisorChat(workerCount: number, labelPrefix = 'solve let i = 0 return (messages) => { - // A real brain reads `messages` (the folded tool results) to decide; the - // scripted one advances its fixed plan. Touch `messages` so the shape is - // exercised. + // A real brain READS `messages` (the folded worker outputs + tool results) and + // composes its next move FROM them — that read is "the fold". This scripted brain + // deliberately IGNORES `messages` and advances a fixed plan, so do NOT mistake this + // for the supervisor pattern. To see a driver that actually reads the last worker's + // output and builds the next instruction from it, read examples/driver-loop/. 
+ // We touch `messages` only so the shape is exercised: void messages.length const turn = turns[Math.min(i, turns.length - 1)] ?? { content: '', toolCalls: [] } i += 1 diff --git a/examples/ui-audit/README.md b/examples/ui-audit/README.md index 7a5eabdc..a1b2812d 100644 --- a/examples/ui-audit/README.md +++ b/examples/ui-audit/README.md @@ -6,8 +6,8 @@ The example uses a **stub judge** so it runs without an API key and demonstrates ## What the example shows -- A custom `SandboxClient` — the in-process browser+judge client — satisfies the kernel contract WITHOUT a real sandbox-SDK harness. The kernel does `client.create() → box.streamPrompt() → box.delete()` exactly as it does for `coderProfile`; the work happens in-process. -- A custom `Driver` (`lensCyclingDriver`) plans one iteration per lens in a fixed order. Supply your own `Driver` that authors its topology from the trace for richer policies. +- A custom `SandboxClient` — the in-process browser+judge client — satisfies the kernel contract WITHOUT a real sandbox-SDK harness. The kernel does `client.create() → box.streamPrompt() → box.delete()` exactly as it does for any profile (e.g. `researcherProfile`); the work happens in-process. +- A custom `Driver` (`lensCyclingDriver`) plans one iteration per lens in a fixed order. It is **content-blind**: it cycles a fixed lens list off `history.length` and never reads a worker's output to decide what to do next. For a driver that re-plans *from* worker output, see [`driver-loop/`](../driver-loop). Supply your own `Driver` that authors its topology from the trace for richer policies. - `appendFindings(workspaceDir, findings)` and `writeAuditIndex(workspaceDir)` persist self-contained GitHub-issue Markdown files plus a registry + index. 
## Run diff --git a/examples/ui-audit/ui-audit.ts b/examples/ui-audit/ui-audit.ts index a78dab87..a6e4bb85 100644 --- a/examples/ui-audit/ui-audit.ts +++ b/examples/ui-audit/ui-audit.ts @@ -68,6 +68,10 @@ function lensCyclingDriver( ): Driver { let cursor = 0 return { + // This driver is CONTENT-BLIND by design: it cycles a fixed lens list off + // `history.length` and never reads a worker's output. So "driver" here is just + // a counter, NOT the output-driven re-planner. For a driver that builds the next + // prompt FROM the last worker's output (the fold), see examples/driver-loop/. // plan() returns Task[] — one lens per iteration, [] once all lenses are // cycled. The empty plan is what ends the loop: neither 'complete' nor // 'failed' is a terminal Decision (isTerminalDecision = stop|fail|done| From 56b143c1002f41ae34895a311667e1da9e2fd0b0 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 23 Jun 2026 16:40:59 -0600 Subject: [PATCH 2/2] =?UTF-8?q?docs(examples):=20align=20driver-loop=20voc?= =?UTF-8?q?abulary=20=E2=80=94=20a=20shot=20is=20one=20driver-worker=20exc?= =?UTF-8?q?hange?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shot = round = turn = one (driver prompts worker -> worker output -> driver) exchange; 'many shots' is the sequence where each output folds into the next prompt. Remove the runMultishot 'parallel shots' section: runMultishot is a multi-turn conversation, not a fanout, so it mislabeled the breadth axis. Point to researcher-loop for fanout instead. 
--- examples/driver-loop/README.md | 34 +++--- examples/driver-loop/driver-loop.ts | 170 ++++++++-------------------- 2 files changed, 67 insertions(+), 137 deletions(-) diff --git a/examples/driver-loop/README.md b/examples/driver-loop/README.md index cb38b5ac..fb0d1c2d 100644 --- a/examples/driver-loop/README.md +++ b/examples/driver-loop/README.md @@ -13,23 +13,23 @@ pnpm tsx examples/driver-loop/driver-loop.ts ## Vocabulary -These words are used across every example and defined here. +These words are used across every example. The key thing: **a shot, a round, and a turn are the +same atom** — one driver↔worker exchange. "Many shots" is the *sequence* of them, not a fanout. | Term | Meaning | |---|---| -| **round** | One full driver cycle: `plan → run workers → decide`. The `runLoop` kernel runs exactly this, once per round. | -| **shot** | One independent worker attempt/sample. A single round can run many shots (a fanout). | -| **multishot** | N shots played in parallel. | -| **sample** | A strategy: take the best of N shots (breadth). | -| **refine** | A strategy: iterate-with-critique *across rounds* (depth) — what SECTION 1 of this example does. | +| **shot** = **round** = **turn** | ONE driver↔worker exchange: `driver ──prompt──▶ worker ──output (+traces/analysis)──▶ driver`. (`runLoop` increments a "round"; the multi-turn conversation primitive calls it a "turn"; people say "shot". Same atom.) | +| **the loop** (*"many shots"*) | A **sequence** of shots where each output **folds** into the next prompt: `prompt0 ▶ worker ▶ output0 ▶ driver ▶ prompt1 ▶ worker ▶ …`. Each shot builds on the last. **This example.** | +| **refine** | The strategy this file uses: keep taking shots, folding the last output into the next prompt, until a check passes (depth). | +| **fanout** (*best-of-N*) | A **different** axis: N *independent* shots with **no fold** between them, keep the best (breadth). 
This is **not** "many shots" in the looping sense — see `examples/researcher-loop`. | ## What the example shows -**SECTION 1 — ROUNDS (refine), the centerpiece.** A multi-round driver: +A multi-shot **refine** driver: -- **Round 0** — `driver.plan(task, history=[])`: no history yet, so it runs the worker once. The +- **Shot 0** — `driver.plan(task, history=[])`: no history yet, so it runs the worker once. The worker drafts a release note but forgets a required word, so the validator **rejects** it. -- **Round 1** — `driver.plan(task, history=[1 rejected])`: the driver READS the rejected draft +- **Shot 1** — `driver.plan(task, history=[1 rejected])`: the driver READS the rejected draft and its verdict out of `history`, then COMPOSES a corrective prompt *from that output* ("your draft was X, it was rejected because Y — rewrite it to mention Z"). The worker obeys the new prompt and the validator **passes**. @@ -43,13 +43,13 @@ is visible. ```mermaid flowchart TD task["NoteTask\nprompt: draft a release note"] --> plan0 - subgraph r0["ROUND 0 — plan(task, history=[])"] + subgraph s0["SHOT 0 — plan(task, history=[])"] plan0["driver runs the worker once"] end plan0 --> w0["worker → 'Shipped one-click restore for failed deploys.'"] w0 --> v0{"validator: mentions 'rollback'?"} v0 -->|no — REJECT| fold["THE FOLD\ndriver reads the rejected draft\n+ builds a corrective prompt from it"] - subgraph r1["ROUND 1 — plan(task, history=[1 rejected])"] + subgraph s1["SHOT 1 — plan(task, history=[1 rejected])"] fold end fold --> w1["worker → '…with an instant rollback path…'"] @@ -57,10 +57,10 @@ flowchart TD v1 -->|yes — PASS| done["decide → pick-winner"] ``` -**SECTION 2 — SHOTS (multishot), the contrast.** Three independent attempts at the same task, -in parallel, with **no fold between them**. This is the *other* axis: a round refines depth-wise -(each round improves on the last); a shot explores breadth-wise (many tries at once). 
Seeing them -side by side is the cleanest way to internalize round vs shot. +**Shot vs fanout (the other axis).** This file refines *depth*-wise: each shot improves on the +last by folding its output forward. The orthogonal move is *breadth* — fire N independent shots at +once with no fold between them and keep the best (a fanout / best-of-N). That's a different example: +see `examples/researcher-loop`, whose driver is single-round and content-blind on purpose. ## Where this goes next @@ -68,5 +68,5 @@ side by side is the cleanest way to internalize round vs shot. for you. - `examples/supervisor-loop/` — the same supervisor over a real worker backend (sandbox box / local cli-bridge), worker backend as the only knob. -- `examples/researcher-loop/` and `examples/ui-audit/` — `runLoop` drivers that are *single-round* - and *content-blind* on purpose (they never fold); read those to see the contrast with this one. +- `examples/researcher-loop/` — a `runLoop` driver that is *single-round* and *content-blind* on + purpose (a fanout, never a fold); read it to see the breadth axis next to this file's depth axis. diff --git a/examples/driver-loop/driver-loop.ts b/examples/driver-loop/driver-loop.ts index 6044a3bd..574bfe81 100644 --- a/examples/driver-loop/driver-loop.ts +++ b/examples/driver-loop/driver-loop.ts @@ -1,24 +1,33 @@ /** * driver-loop — SEE THE FOLD. * - * This is the one concept that makes the whole supervisor/driver story click: a driver - * does not just count iterations. It READS the last worker's actual output and WRITES the - * next instruction FROM that output. That read-then-rewrite is "the fold". Everything else - * in this repo — supervise(), the coordination MCP, the self-improvement loop — is built on - * top of this single move. + * The one concept that makes the whole supervisor/driver story click: a driver does not just + * count attempts. It READS the last worker's output and WRITES the next instruction FROM it. 
+ * That read-then-rewrite is "the fold". supervise(), the coordination MCP, and the + * self-improvement loop are all built on this single move. * - * ── Vocabulary (used everywhere, defined here) ────────────────────────────────────────── - * • round — one full driver cycle: plan → run workers → decide. The `runLoop` kernel - * calls plan(), runs the planned workers, then calls decide(), once per round. - * • shot — one independent worker attempt/sample. A round can run many shots (a fanout). - * • multishot — N shots played in parallel (see SECTION 2 below). - * • sample — a strategy: take the best of N shots (breadth). - * • refine — a strategy: iterate-with-critique ACROSS rounds (depth) — this file's SECTION 1. + * ── Vocabulary (one exchange, three names — all the SAME atom) ──────────────────────────── * - * SECTION 1 (the centerpiece) is a multi-ROUND refine driver. Round 0 asks the worker to draft - * a release note; the validator rejects it for missing a required word; the driver READS that - * rejected draft and BUILDS a corrective prompt from it; round 1 re-runs with that prompt and - * passes. SECTION 2 contrasts it with a multi-SHOT run so the two axes sit side by side. + * • shot = round = turn — ONE driver↔worker exchange: + * + * driver ──prompt──▶ worker ──output (+ traces / analysis)──▶ driver + * + * The driver sends a prompt, the worker runs, its output comes back, the driver reads it. + * (`runLoop` increments a "round"; the multi-turn conversation primitive calls it a "turn"; + * people say "shot". Same atom — pick whichever word you like.) + * + * • the loop ("many shots") — a SEQUENCE of shots where each output FOLDS into the next prompt: + * + * prompt0 ▶ worker ▶ output0 ▶ driver ▶ prompt1 ▶ worker ▶ output1 ▶ driver ▶ … + * + * Each shot builds on the last. THIS FILE is exactly that, and it's almost always what you want. 
+ * + * • fanout (breadth / best-of-N) — a DIFFERENT axis: N independent shots with NO fold between + * them, keep the best. That is NOT "many shots" in the looping sense. See examples/researcher-loop. + * + * This file is a multi-shot REFINE driver. Shot 0 drafts a release note; the validator rejects it + * for a missing word; the driver READS that rejected draft and BUILDS a corrective prompt from it; + * shot 1 re-runs with that prompt and passes — proving the loop's behavior changed BECAUSE of the fold. * * Fully offline — the worker is a scripted client keyed on the prompt, so it runs with zero * credentials (the same offline pattern self-improving-loop uses). @@ -26,11 +35,6 @@ * Run: pnpm tsx examples/driver-loop/driver-loop.ts */ -import { - type MultishotPersona, - type MultishotShape, - runMultishot, -} from '@tangle-network/agent-eval/multishot' import { type DefaultVerdict, type Driver, @@ -45,7 +49,7 @@ import type { AgentProfile, SandboxEvent, SandboxInstance } from '@tangle-networ // product would validate something richer; the required word keeps the example deterministic. interface NoteTask { feature: string - /** The next instruction the worker should run. The DRIVER rewrites this between rounds. */ + /** The next instruction the worker should run. The DRIVER rewrites this between shots. */ prompt: string } interface NoteOutput { @@ -57,7 +61,7 @@ const requiredWord = 'rollback' // A worker is just something that takes a prompt and streams back events. Here we fake it: // the FIRST prompt produces a draft that forgets the required word (so it will be rejected); // any prompt that mentions the required word produces a corrected draft. That keyed behavior -// is what lets the example PROVE the fold worked: round 1 only passes because the driver put +// is what lets the example PROVE the fold worked: shot 1 only passes because the driver put // the right correction into the prompt. 
function scriptedWorkerClient(): { create(): Promise } { return { @@ -107,30 +111,30 @@ const validator: Validator = { } // ── THE DRIVER — this is the example ──────────────────────────────────────────────────── -// A driver is two functions: plan() (what to run this round) and decide() (are we done?). -// The fold lives inside plan(): on round > 0 it READS history (the last worker's real output +// A driver is two functions: plan() (what to run this shot) and decide() (are we done?). +// The fold lives inside plan(): on shot > 0 it READS history (the last worker's real output // + its verdict) and COMPOSES the next prompt FROM that output. // // Decision values: the kernel STOPS the loop when decide() returns a TERMINAL value // ('stop' | 'pick-winner' | 'fail' | 'done'). Any other string is non-terminal → the loop -// runs another round. That's the footgun for a refine driver: if decide() returned 'fail' -// after a failing round 0, the loop would stop BEFORE it ever got to refine. So we return the +// runs another shot. That's the footgun for a refine driver: if decide() returned 'fail' +// after a failing shot 0, the loop would stop BEFORE it ever got to refine. So we return the // non-terminal 'refine' to keep going, and only the terminal 'pick-winner'/'fail' when truly done. type NoteDecision = 'refine' | 'pick-winner' | 'fail' -function refineDriver(maxRounds: number): Driver { +function refineDriver(maxShots: number): Driver { return { name: 'refine', async plan(task, history) { - // ROUND 0 — no history yet, so just run the initial task once. + // SHOT 0 — no history yet, so just run the initial task once. if (history.length === 0) return [task] - // We already passed? Stop refining (return [] → no more workers this round). + // We already passed? Stop refining (return [] → no more workers). const last = history[history.length - 1] if (last?.verdict?.valid) return [] - // Round cap: stop even if still failing. 
- if (history.length >= maxRounds) return [] + // Shot cap: stop even if still failing. + if (history.length >= maxShots) return [] // ── THE FOLD, PART 1: INGEST the last worker's actual output ──────────────────────── // `history[history.length - 1].output` is the real answer the previous worker produced; @@ -152,20 +156,20 @@ function refineDriver(maxRounds: number): Driver it.verdict?.valid)) return 'pick-winner' - return history.length < maxRounds ? 'refine' : 'fail' + return history.length < maxShots ? 'refine' : 'fail' }, } } -// ── SECTION 1: run the refine (multi-round) driver ────────────────────────────────────── -async function runRefine(): Promise { - console.log('── SECTION 1 · ROUNDS (refine) — driver reads worker output, rewrites the prompt') +// ── Run the refine (multi-shot) driver ────────────────────────────────────────────────── +async function main(): Promise { + console.log('driver-loop · the driver reads each shot’s output and rewrites the next prompt\n') const task: NoteTask = { feature: 'one-click restore', @@ -176,7 +180,7 @@ async function runRefine(): Promise { driver: refineDriver(3), agentRun: { profile: { name: 'note-writer' } as AgentProfile, - // Each round's task carries the prompt the driver authored; this is how the rewritten + // Each shot's task carries the prompt the driver authored; this is how the rewritten // instruction actually reaches the worker. taskToPrompt: (t) => t.prompt, }, @@ -187,90 +191,16 @@ async function runRefine(): Promise { maxIterations: 5, }) - // One iteration == one round here (the driver runs a single worker per round). + // One iteration == one shot here (the driver runs a single worker per shot). for (const it of result.iterations) { const verdict = it.verdict?.valid ? 'PASS' : 'reject' - console.log(` ROUND ${it.index}: [${verdict}] note = "${it.output?.note ?? ''}"`) + console.log(`SHOT ${it.index}: [${verdict}] note = "${it.output?.note ?? 
''}"`) if (!it.verdict?.valid && it.index < result.iterations.length - 1) { - console.log(' └─ driver folds this rejected output into round', it.index + 1) + console.log(` └─ driver folds this rejected output into shot ${it.index + 1}`) } } - console.log(` decision: ${result.decision}`) - if (result.winner) console.log(` winner: round ${result.winner.iterationIndex}`) - console.log() -} - -// ── SECTION 2: contrast — SHOTS (multishot), the OTHER axis ────────────────────────────── -// A round refines DEPTH-wise (each round improves on the last). A shot explores BREADTH-wise: -// N independent attempts at the SAME task, in parallel, no fold between them. runMultishot is -// the substrate primitive for that. We run it with a mocked router so it stays offline. -interface SimplePersona extends MultishotPersona { - id: string -} -async function runShots(): Promise { - console.log('── SECTION 2 · SHOTS (multishot) — N independent attempts, no fold between them') - - const restore = installMockRouter([ - { text: 'Attempt A: one-click restore with a rollback path.' }, - { text: 'Attempt B: one-click restore, instant rollback if a deploy fails.' }, - { text: 'Attempt C: one-click restore; rollback included.' }, - ]) - process.env.TANGLE_API_KEY ??= 'test-key' - try { - const profile: AgentProfile = { - name: 'note-writer', - prompt: { systemPrompt: 'Write a one-line release note that mentions rollback.' }, - } - const shape: MultishotShape = { - buildOpener: () => 'Write the release note.', - buildDriverSystemPrompt: () => 'You are drafting a release note.', - } - // Three personas == three shots; they run independently. There is no round-to-round fold. - for (const id of ['shot-0', 'shot-1', 'shot-2']) { - const res = await runMultishot({ - profile, - persona: { id } as SimplePersona, - shape, - maxTurns: 1, - }) - // Grab the worker's reply: the last non-user, non-tool message in the transcript. 
- const reply = [...res.transcript] - .reverse() - .find((m) => m.role !== 'user' && m.role !== 'tool') - console.log(` ${id} (parallel): "${reply?.content ?? ''}"`) - } - } finally { - restore() - } - console.log() - console.log(' ROUND vs SHOT: a round folds the last output into the next prompt (depth);') - console.log(' a shot is one independent attempt; multishot plays N shots at once (breadth).') -} - -// Minimal offline router stub (same pattern as self-improving-loop) so SECTION 2 needs no creds. -function installMockRouter(replies: Array<{ text: string }>): () => void { - const original = global.fetch - let i = 0 - global.fetch = (async () => { - const r = replies[i++ % replies.length] - return { - ok: true, - status: 200, - json: async () => ({ - choices: [{ message: { content: r?.text ?? '' } }], - usage: { prompt_tokens: 80, completion_tokens: 20 }, - }), - text: async () => 'ok', - } as Response - }) as typeof fetch - return () => { - global.fetch = original - } -} - -async function main(): Promise { - await runRefine() - await runShots() + console.log(`\ndecision: ${result.decision}`) + if (result.winner) console.log(`winner: shot ${result.winner.iterationIndex}`) } main().catch((err) => {