diff --git a/bench/HARNESS.md b/bench/HARNESS.md index da1faa55..8d72346a 100644 --- a/bench/HARNESS.md +++ b/bench/HARNESS.md @@ -164,6 +164,17 @@ via the router, is graded by the runnable checker, and that `BenchScore` is the Offline plumbing test (no creds): `tsx src/gate.test.mts`. The gate runs through the SAME recursive atom every personified loop uses. +## "Supervisor" (iterate/decompose) vs blind — through the PUBLISHED suite +The supervisor-vs-blind gate is NOT a bespoke harness: it is `runBenchmark([sample, refine, …])` +over an Environment. blind = `sample` (best-of-k); "supervisor" = `refine`/`sampleThenRefine` +(depth: attempt→firewalled-analyst-steer→retry — *"a multi-agent team is just a Strategy whose driver +spawns several agents"*). Equal compute by the substrate's CONSERVED budget; the deployable check is +the Environment's `score`; the can't-fake-the-check firewall is built in. Run it on the HARD real +domain via `commit0-env-run.mts` (above) or the toy `strategy-demo.mts` (offline). The LLM +agent-driver (an LLM that itself decides spawns via the coordination MCP) is the SEPARATE product +path — `atom-mcp-e2e.mts` / `atom-commit0.mts` — not a strategy. Evolve any strategy on a frozen +holdout with `runStrategyEvolution`. + ## Generate a fresh corpus + gate it The rollout generators now live with their domains: the recursive gate (`gate-cli.mts`) and the optimization-suite env runs (`commit0-env-run.mts`, diff --git a/bench/src/atom-humaneval.mts b/bench/src/atom-humaneval.mts index 0a394b78..64c43f5f 100644 --- a/bench/src/atom-humaneval.mts +++ b/bench/src/atom-humaneval.mts @@ -189,7 +189,7 @@ async function driveTask( }) const tree = await journal.loadTree(runId) const tokens = (tree ?? 
[]) - .filter((e): e is Extract<(typeof tree)[number], { kind: 'settled' }> => e.kind === 'settled') + .filter((e): e is Extract<NonNullable<typeof tree>[number], { kind: 'settled' }> => e.kind === 'settled') .reduce((s, e) => s + e.spent.tokens.input + e.spent.tokens.output, 0) const replay = renderReplayHtml(recorder.timeline(runId), { title: `${task.taskId} · driver=${driverCfg.model}`, diff --git a/bench/src/atom-mcp-e2e.mts b/bench/src/atom-mcp-e2e.mts index 04c8db76..3da19796 100644 --- a/bench/src/atom-mcp-e2e.mts +++ b/bench/src/atom-mcp-e2e.mts @@ -13,7 +13,7 @@ */ import { execFileSync } from 'node:child_process' -import { cpSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { dirname, join } from 'node:path' import { fileURLToPath } from 'node:url' @@ -26,9 +26,12 @@ import { createSupervisor, type Executor, type ExecutorResult, + gitWorkspace, InMemoryResultBlobStore, InMemorySpawnJournal, + runInWorkspace, type Scope, + type Workspace, } from '../../src/runtime/index' import { asAuthoredProfile } from '../../src/runtime/supervise/authoring' import { serveCoordinationMcp } from '../../src/runtime/supervise/coordination-mcp' @@ -41,14 +44,28 @@ const SKILL_MD = readFileSync(join(REPO, 'skills', 'supervise', 'SKILL.md'), 'ut const TASK = 'In solution.py, implement add(a, b) so it returns the sum a + b and test_solution.py passes.' -function makeTaskTemplate(): string { - const dir = mkdtempSync(join(tmpdir(), 'e2e-task-')) - writeFileSync(join(dir, 'solution.py'), 'def add(a, b):\n raise NotImplementedError\n') +/** Seed a bare git repo with the failing task — the SHARED workspace ref every worker clones. 
*/ +function seedWorkspaceRepo(): string { + const git = (args: string[], cwd?: string): void => { + execFileSync('git', ['-c', 'core.hooksPath=/dev/null', '-c', 'user.email=t@t', '-c', 'user.name=t', ...args], { + cwd, + stdio: 'pipe', + }) + } + const bare = `${mkdtempSync(join(tmpdir(), 'e2e-ws-'))}.git` + git(['init', '--bare', '-b', 'main', bare]) + const seed = mkdtempSync(join(tmpdir(), 'e2e-seed-')) + git(['clone', bare, seed]) + writeFileSync(join(seed, 'solution.py'), 'def add(a, b):\n raise NotImplementedError\n') writeFileSync( - join(dir, 'test_solution.py'), + join(seed, 'test_solution.py'), 'from solution import add\nassert add(2, 3) == 5\nassert add(-1, 1) == 0\nassert add(0, 0) == 0\nprint("PASS")\n', ) - return dir + git(['add', '-A'], seed) + git(['commit', '-m', 'task'], seed) + git(['push', 'origin', 'main'], seed) + rmSync(seed, { recursive: true, force: true }) + return bare } /** The deployable check: run the test in the worker's cwd. Exit 0 = delivered. No LLM judge. */ @@ -83,35 +100,41 @@ async function bridgeChat(opts: { const transcripts: Array<{ who: string; said: string; delivered?: boolean }> = [] -/** A WORKER = a real opencode coding session in its OWN cwd, graded by the real test. */ -function makeWorker(rawProfile: unknown, templateDir: string, n: number): Agent { +/** A WORKER = a real opencode coding session in a clone of the SHARED workspace, graded by the + * real test; its delivery is committed back so the next worker builds on it (not isolated). */ +function makeWorker(rawProfile: unknown, ws: Workspace, n: number): Agent { const p = asAuthoredProfile(rawProfile) const name = p?.name ?? `worker-${n}` let artifact: ExecutorResult | undefined const inner: Executor = { runtime: 'router', async execute() { - const cwd = mkdtempSync(join(tmpdir(), 'e2e-worker-')) - cpSync(templateDir, cwd, { recursive: true }) const sys = p?.systemPrompt ?? 
TASK - const said = await bridgeChat({ - messages: [ - { - role: 'user', - content: `${sys}\n\nYou are working in the current directory. Edit the files so that running \`python3 test_solution.py\` prints PASS. Do it now.`, - }, - ], - cwd, - }) - const delivered = checkPasses(cwd) - transcripts.push({ who: name, said: said.slice(0, 300), delivered }) + const run = await runInWorkspace( + ws, + async (cwd) => { + const said = await bridgeChat({ + messages: [ + { + role: 'user', + content: `${sys}\n\nYou are working in the current directory (it already holds prior workers' committed progress). Edit the files so that running \`python3 test_solution.py\` prints PASS. Do it now.`, + }, + ], + cwd, + }) + const valid = checkPasses(cwd) + transcripts.push({ who: name, said: said.slice(0, 300), delivered: valid }) + return { valid, value: said.slice(0, 120), message: `${name}: ${valid ? 'delivered' : 'wip'}` } + }, + { tmpPrefix: 'e2e-worker-', commitOnInvalid: true }, + ) + const delivered = run.valid artifact = { outRef: contentAddress(`${name}:${delivered}`), - out: { worker: name, delivered, profileSystemPrompt: sys.slice(0, 120) }, + out: { worker: name, delivered, rev: run.commit?.ok ? run.commit.rev : undefined, profileSystemPrompt: sys.slice(0, 120) }, verdict: { valid: delivered, score: delivered ? 
1 : 0 }, spent: { iterations: 1, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 }, } - rmSync(cwd, { recursive: true, force: true }) return artifact }, teardown: () => Promise.resolve({ destroyed: true }), @@ -125,8 +148,9 @@ function makeWorker(rawProfile: unknown, templateDir: string, n: number): Agent< } async function main(): Promise<void> { - console.log(`atom-mcp-e2e: model=${MODEL} (real boxes, real MCP, real test)`) - const templateDir = makeTaskTemplate() + console.log(`atom-mcp-e2e: model=${MODEL} (real boxes, real MCP, real test, shared workspace)`) + const bareRef = seedWorkspaceRepo() + const ws = gitWorkspace({ ref: bareRef }) const blobs = new InMemoryResultBlobStore() let n = 0 @@ -136,7 +160,7 @@ async function main(): Promise<void> { const mcp = await serveCoordinationMcp({ scope, blobs, - makeWorkerAgent: (raw) => makeWorker(raw, templateDir, n++), + makeWorkerAgent: (raw) => makeWorker(raw, ws, n++), perWorker: { maxIterations: 2, maxTokens: 200_000 }, }) // The supervisor's cwd carries the REAL skill file (opencode loads it from the cwd skill dirs). @@ -178,7 +202,7 @@ async function main(): Promise<void> { maxDepth: 4, now: () => Date.now(), }) - rmSync(templateDir, { recursive: true, force: true }) + rmSync(bareRef, { recursive: true, force: true }) console.log('\n── transcripts (real driver↔worker) ──') for (const t of transcripts) { diff --git a/docs/research/interactive-sessions-spec.md b/docs/research/interactive-sessions-spec.md new file mode 100644 index 00000000..022bf777 --- /dev/null +++ b/docs/research/interactive-sessions-spec.md @@ -0,0 +1,75 @@ +# Spec — interactive (tmux) harness sessions + live streaming + +**Vision (one sentence):** instead of headless one-shot CLI calls, each agent in a supervised run is a **live, interactive harness session in its own tmux window** (driveable, observable, resumable), the whole agent tree is one tmux session, and it streams to a browser — composing with the recorded animated replay. 
+ +**Why now:** the whole real chain already delivers — an opencode supervisor drives opencode workers via the coordination MCP, a real deployable check gates delivery (`bench/src/atom-mcp-e2e.mts`, `972707f`). What's missing is (a) the agents run *headless* (one prompt → output), so you can't watch or interact, and (b) the harness-specific glue lives in a bench script, not the substrate. This spec turns both into a real, generalized capability. + +## Placement — who owns what (obeys the AgentProfile law + the layering) + +The law: *an agent IS its AgentProfile; you change behavior by authoring the profile and letting the substrate materialize it — never specialize the runtime to a harness.* That decides the split cleanly: + +| Layer | Owns | Why | +|---|---|---| +| **agent-runtime** (this repo) | The **recursion + the ports**: the coordination MCP over the Scope (`serveCoordinationMcp`, done), a generic **`session` Executor** that opens/drives/observes a session via the substrate's API (NOT tmux-aware), the shared `Workspace` seam, the journal→replay. | The runtime stays harness-agnostic. It drives; it never spawns tmux or knows what opencode is. | +| **agent-dev-container** (adc) | The **materialization**: given an `AgentProfile` + cwd + mcp config, stand up the harness as an **interactive tmux window** (the TUI, not `run`), materialize the FULL profile (skills as real SKILL.md files, tools, model, mcp), capture (`pipe-pane`) + stream (`ttyd`). Exposes a **session API** (create / send / observe / status / kill). | "the container where the agents actually live" — Drew. This is the harness-specific layer; it belongs in the substrate, never the runtime. | +| **cli-bridge** | Stays the *headless* harness materializer (the test target + the fast path). Optionally grows the same session API for local runs. | Already proven; the adc is the richer/interactive home. | +| **sandbox SDK** | The `AgentProfile` manifest + box abstraction the adc is a flavor of. 
| Where the profile shape + `resources.skills` materialization already live. | + +**The seam** = a small **session API** the adc exposes and the runtime's `session` Executor consumes: +`POST /sessions {profile, cwd, mcp} → {id, ttydUrl}` · `POST /sessions/:id/send {text}` · `GET /sessions/:id/stream` (SSE: harness output + a done/settle signal) · `GET /sessions/:id/status` · `DELETE /sessions/:id`. The runtime drives the recursion through the coordination MCP; the substrate drives the *harness* through this API. + +## Where the issue goes +- **Primary issue → `tangle-network/agent-dev-container`** (the materialization + the session API + ttyd). This spec is the design ref. +- **Companion issue → `tangle-network/agent-runtime`** (the generic `session` Executor + the shared `Workspace` wiring + replay-compose). Small; mostly the executor seam. +- **Track on `ops-board`** (lane: eng, owner: claude) with measurable done-criteria = the e2e checklist below. + +## End-to-end checklist (the map to "done") + +### Phase 0 — preconditions (DONE) +- [x] Coordination MCP over a live Scope (`serveCoordinationMcp`, real test). +- [x] Proof a coding harness mounts + calls it (`mcp-mount-probe`). +- [x] Whole headless e2e delivers (`atom-mcp-e2e`). +- [x] Standard `skills/supervise/SKILL.md`. + +### Phase 1 — substrate: AgentProfile materialization (adc + bridge) *(Drew's "materialize the entire profile")* +- [ ] Materialize `resources.skills` as real `SKILL.md` files in the harness skill dir (opencode `~/.config/opencode/skill/` + project `.opencode/skill/`; verify the exact dir per harness) — loaded natively, NOT a prompt note. +- [ ] Materialize tools, model, system prompt, mcp (mcp already works — `type:'http'`). +- [ ] One `materializeAgentProfile(profile, dir)` per harness; remove the bench script's cwd-writes. +- [ ] Exit: a profile with a skill drives behavior with zero prompt-stuffing (probe: agent uses a skill it was never told about in the prompt). 
+ +### Phase 2 — substrate: interactive tmux session + session API (adc) +- [ ] `tmux new-session`/`new-window` per run/agent; run the harness in **interactive** mode (TUI), one window per agent, named by agent id. +- [ ] Drive: send the prompt (send-keys or the harness's stdin protocol); detect completion (harness done-signal / sentinel) → emit a settle event. +- [ ] Capture: `pipe-pane` → a transcript stream (for the journal). +- [ ] The session API (create/send/stream/status/kill) over HTTP. +- [ ] Resource governance: max concurrent windows, per-session timeout, cleanup on settle/crash. +- [ ] Exit: `POST /sessions` with a profile → a live tmux window you can `tmux attach` to; `/stream` yields output + a done signal. + +### Phase 3 — runtime: the generic `session` Executor (agent-runtime) +- [ ] A `session` backend on the `Executor` port: `execute` calls the substrate session API (create → send task → stream until done) and settles with the result; `deliver` → `/send` (steer); `teardown` → `/kill`. Harness-agnostic. +- [ ] Wire `makeWorkerAgent` (coordination MCP) → the `session` executor, selected by the worker's `AgentProfile.backend`. +- [ ] Exit: `spawn_worker` → a worker that runs as a live interactive session, settles on its deployable check. + +### Phase 4 — shared workspace (agent-runtime) *(the e2e's open design point)* +- [ ] Supervisor + its workers share ONE `Workspace` (gitWorkspace) — workers branch/worktree, deliver back so the supervisor (and the next worker) build on one artifact. Fixes the "files missing" confusion. +- [ ] Exit: a 2-worker run where worker-2 builds on worker-1's committed output. + +### Phase 5 — streaming + viz (adc + the viewer) +- [ ] `ttyd` serves the run's tmux session over a websocket; auth (bearer); a stable URL per run. +- [ ] A viewer page: the live tmux stream (now) beside the **animated replay** (the recorded tree) + the topology — one screen, live + history. 
+- [ ] Exit: open the URL, watch the supervisor + worker panes work in real time; scrub the replay after. + +### Phase 6 — prove it e2e (no mock) +- [ ] The whole chain on interactive sessions: supervisor (tmux) authors profiles → `spawn_worker` → worker (tmux) codes in the shared workspace → real test gates → delivered — all streamed live, all journaled, replayable. +- [ ] Retire `atom-mcp-e2e`'s harness-specific shortcuts (now: author profiles, the substrate materializes). +- [ ] Exit: a recorded run URL + the replay + green deployable check. + +## Open design points (decide during Phase 2–4) +- **Interactive vs headless harness mode:** does opencode/claude-code expose a driveable interactive TUI, or do we run `run` *inside* the pane for the live-output view? (Headless-in-a-pane is the cheap first cut; true interactive is the goal.) +- **Completion detection** in a TUI (sentinel vs a harness done event). +- **Session lifecycle:** resume after a crash (the journal already supports replay/resume — extend to sessions). +- **Security:** ttyd exposure + the coordination MCP exposure (bind localhost / authd tunnel). +- **Concurrency:** N agents = N windows; the adc's resource limits. + +## Net +The runtime is essentially done for this (coordination MCP + the executor port + replay). The new work is a **substrate capability in the adc** (interactive tmux sessions + full-profile materialization + ttyd), reached through one small session API and one generic `session` executor in the runtime. Nothing here specializes the runtime to a harness. diff --git a/skills/build-with-agent-runtime/SKILL.md b/skills/build-with-agent-runtime/SKILL.md index 52401922..bfd4251a 100644 --- a/skills/build-with-agent-runtime/SKILL.md +++ b/skills/build-with-agent-runtime/SKILL.md @@ -68,7 +68,7 @@ signature + the exact "do NOT build". 
| **Gate: ship/hold from a `BenchmarkReport`** (per-task cells) | `promotionGate({ report, incumbent, candidate })` — `/runtime` | canonical-api §3.4 | | **Run the full multi-generation flywheel + certify** | `runStrategyEvolution(config)` — `/runtime` | canonical-api §3.4 | | **Compose the prod sandbox profile** (eval/prod parity) | `composeProductionAgentProfile(base, opts)` — `/mcp` | canonical-api §3.2 | -| **Observe a run** (cost/time waterfall, live tree, OTLP) | `createWaterfallCollector` / `createTopologyView` / `createOtelExporter` via `composeRuntimeHooks(...)` — root | canonical-api §3.5 | +| **Observe a run** (cost/time waterfall, live tree, OTLP) | `createWaterfallCollector` / `createOtelExporter` via `composeRuntimeHooks(...)` — root; `createTopologyView` / `renderTopologyTree` — `/topology` | canonical-api §3.5 | | **State any A/B claim** | `pairedLift` (bench) over `pairedBootstrap`/`heldoutSignificance` (substrate) | canonical-api §3.5 | | **Observe/ship with billing-boundary** | `withTangleIntelligence(agent, { project, effort })` — `/intelligence` | canonical-api §7 (now live on main — verify) | diff --git a/src/runtime/index.ts b/src/runtime/index.ts index fc72d48a..e4a924e1 100644 --- a/src/runtime/index.ts +++ b/src/runtime/index.ts @@ -311,6 +311,9 @@ export { isDriverSpec, withDriverExecutor, } from './supervise/driver-executor' +// Supervisor-as-MCP: serve the coordination verbs as a real HTTP MCP over a live Scope, so any +// harness (claude-code / codex / opencode) BECOMES the supervisor by mounting one MCP server. +export { type CoordinationMcpHandle, serveCoordinationMcp } from './supervise/coordination-mcp' // The ONE built-in executor entrypoint: backend-as-data (`createExecutor({backend})`). // The per-backend factories are internal case-arms; BYO agents implement `Executor`. 
export { @@ -407,7 +410,9 @@ export { gitWorkspace, jjWorkspace, localShell, + runInWorkspace, type Shell, type Workspace, type WorkspaceCommit, + type WorkspaceRun, } from './workspace' diff --git a/src/runtime/workspace.ts b/src/runtime/workspace.ts index 3246a460..569aa66a 100644 --- a/src/runtime/workspace.ts +++ b/src/runtime/workspace.ts @@ -132,6 +132,43 @@ export function jjWorkspace(opts: GitWorkspaceOptions): Workspace { } } +export interface WorkspaceRun { + readonly valid: boolean + readonly value: T + /** Present when a commit was attempted (valid, or `commitOnInvalid`). */ + readonly commit?: WorkspaceCommit +} + +/** + * Run a worker `body` inside a FRESH clone of a shared `Workspace`, then commit its work back + * so the next worker (or the supervisor) builds on it. This is the seam that turns isolated + * per-worker cwds into one compounding artifact — `body` gets a real materialized dir, its + * delivery is committed to the shared ref iff it's valid (a conflict is returned, never thrown). + * The clone is removed after; durable state lives only in the ref. + */ +export async function runInWorkspace( + ws: Workspace, + body: (cwd: string) => Promise<{ valid: boolean; value: T; message?: string }>, + opts: { tmpPrefix?: string; commitOnInvalid?: boolean } = {}, +): Promise> { + const { mkdtempSync, rmSync } = await import('node:fs') + const { tmpdir } = await import('node:os') + const { join } = await import('node:path') + const dir = mkdtempSync(join(tmpdir(), opts.tmpPrefix ?? 'ws-run-')) + try { + await ws.materialize(dir) + const r = await body(dir) + if (r.valid || opts.commitOnInvalid) { + const message = r.message ?? (r.valid ? 
'worker: delivered' : 'worker: wip') + const commit = await ws.commit(dir, message) + return { valid: r.valid, value: r.value, commit } + } + return { valid: r.valid, value: r.value } + } finally { + rmSync(dir, { recursive: true, force: true }) + } +} + function tail(s: string): string { return s.slice(-400) } diff --git a/tests/loops/workspace.test.ts b/tests/loops/workspace.test.ts index 38b40543..01bd9fc8 100644 --- a/tests/loops/workspace.test.ts +++ b/tests/loops/workspace.test.ts @@ -3,7 +3,16 @@ import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'no import { tmpdir } from 'node:os' import { join } from 'node:path' import { afterEach, beforeEach, describe, expect, it } from 'vitest' -import { gitWorkspace, jjWorkspace } from '../../src/runtime/workspace' +import { gitWorkspace, jjWorkspace, runInWorkspace } from '../../src/runtime/workspace' + +const hasPython = (() => { + try { + execFileSync('python3', ['--version'], { stdio: 'pipe' }) + return true + } catch { + return false + } +})() /** jj is optional and absent in CI — its block skips unless the binary is present. */ const hasJj = (() => { @@ -66,6 +75,66 @@ describe('gitWorkspace', () => { expect(readFileSync(join(w2, 'a.txt'), 'utf-8')).toBe('one\n') }) + it.skipIf(!hasPython)( + 'runInWorkspace: a second worker builds on the first, gated by a real test', + async () => { + // Seed the shared ref with a failing task (two functions to implement, one test). 
+ const seed = fresh() + await gitWorkspace({ ref: bare }).materialize(seed) + writeFileSync( + join(seed, 'solution.py'), + 'def add(a, b):\n raise NotImplementedError\n\n\ndef mul(a, b):\n raise NotImplementedError\n', + ) + writeFileSync( + join(seed, 'test_solution.py'), + 'from solution import add, mul\nassert add(2, 3) == 5\nassert mul(2, 3) == 6\nprint("PASS")\n', + ) + git(['add', '-A'], seed) + git(['commit', '-m', 'task'], seed) + git(['push', 'origin', 'main'], seed) + + const ws = gitWorkspace({ ref: bare }) + const runTest = (cwd: string): boolean => { + try { + execFileSync('python3', ['test_solution.py'], { cwd, stdio: 'pipe', timeout: 30_000 }) + return true + } catch { + return false + } + } + + // Worker 1 implements add() only — the test still fails (mul missing), but it commits WIP. + const r1 = await runInWorkspace( + ws, + async (cwd) => { + const src = readFileSync(join(cwd, 'solution.py'), 'utf-8').replace( + 'def add(a, b):\n raise NotImplementedError', + 'def add(a, b):\n return a + b', + ) + writeFileSync(join(cwd, 'solution.py'), src) + return { valid: runTest(cwd), value: 'w1', message: 'w1: add()' } + }, + { commitOnInvalid: true }, + ) + expect(r1.valid).toBe(false) + expect(r1.commit).toMatchObject({ ok: true }) + + // Worker 2 materializes a FRESH clone — it must already see worker 1's add(), then finish mul(). 
+ const r2 = await runInWorkspace(ws, async (cwd) => { + const before = readFileSync(join(cwd, 'solution.py'), 'utf-8') + expect(before).toContain('return a + b') // compounding: it built on worker 1 + const src = before.replace( + 'def mul(a, b):\n raise NotImplementedError', + 'def mul(a, b):\n return a * b', + ) + writeFileSync(join(cwd, 'solution.py'), src) + return { valid: runTest(cwd), value: 'w2', message: 'w2: mul()' } + }) + expect(r2.valid).toBe(true) + expect(r2.commit).toMatchObject({ ok: true }) + }, + ) + it('returns a typed conflict instead of overwriting concurrent edits', async () => { const ws = gitWorkspace({ ref: bare }) const w1 = fresh()