Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions bench/HARNESS.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,17 @@ via the router, is graded by the runnable checker, and that `BenchScore` is the
Offline plumbing test (no creds): `tsx src/gate.test.mts`. The gate runs through the SAME recursive
atom every personified loop uses.

## "Supervisor" (iterate/decompose) vs blind — through the PUBLISHED suite
The supervisor-vs-blind gate is NOT a bespoke harness: it is `runBenchmark([sample, refine, …])`
over an Environment. blind = `sample` (best-of-k); "supervisor" = `refine`/`sampleThenRefine`
(depth: attempt→firewalled-analyst-steer→retry — *"a multi-agent team is just a Strategy whose driver
spawns several agents"*). Compute is held equal across arms by the substrate's CONSERVED budget; the deployable check is
the Environment's `score`; the can't-fake-the-check firewall is built in. Run it on the HARD real
domain via `commit0-env-run.mts` (above) or the toy `strategy-demo.mts` (offline). The LLM
agent-driver (an LLM that itself decides spawns via the coordination MCP) is the SEPARATE product
path — `atom-mcp-e2e.mts` / `atom-commit0.mts` — not a strategy. Evolve any strategy on a frozen
holdout with `runStrategyEvolution`.

## Generate a fresh corpus + gate it
The rollout generators now live with their domains: the recursive gate
(`gate-cli.mts`) and the optimization-suite env runs (`commit0-env-run.mts`,
Expand Down
2 changes: 1 addition & 1 deletion bench/src/atom-humaneval.mts
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ async function driveTask(
})
const tree = await journal.loadTree(runId)
const tokens = (tree ?? [])
.filter((e): e is Extract<(typeof tree)[number], { kind: 'settled' }> => e.kind === 'settled')
.filter((e): e is Extract<NonNullable<typeof tree>[number], { kind: 'settled' }> => e.kind === 'settled')
.reduce((s, e) => s + e.spent.tokens.input + e.spent.tokens.output, 0)
const replay = renderReplayHtml(recorder.timeline(runId), {
title: `${task.taskId} · driver=${driverCfg.model}`,
Expand Down
78 changes: 51 additions & 27 deletions bench/src/atom-mcp-e2e.mts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
*/

import { execFileSync } from 'node:child_process'
import { cpSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
Expand All @@ -26,9 +26,12 @@ import {
createSupervisor,
type Executor,
type ExecutorResult,
gitWorkspace,
InMemoryResultBlobStore,
InMemorySpawnJournal,
runInWorkspace,
type Scope,
type Workspace,
} from '../../src/runtime/index'
import { asAuthoredProfile } from '../../src/runtime/supervise/authoring'
import { serveCoordinationMcp } from '../../src/runtime/supervise/coordination-mcp'
Expand All @@ -41,14 +44,28 @@ const SKILL_MD = readFileSync(join(REPO, 'skills', 'supervise', 'SKILL.md'), 'ut

const TASK = 'In solution.py, implement add(a, b) so it returns the sum a + b and test_solution.py passes.'

function makeTaskTemplate(): string {
const dir = mkdtempSync(join(tmpdir(), 'e2e-task-'))
writeFileSync(join(dir, 'solution.py'), 'def add(a, b):\n raise NotImplementedError\n')
/** Seed a bare git repo with the failing task — the SHARED workspace ref every worker clones. */
function seedWorkspaceRepo(): string {
const git = (args: string[], cwd?: string): void => {
execFileSync('git', ['-c', 'core.hooksPath=/dev/null', '-c', 'user.email=t@t', '-c', 'user.name=t', ...args], {
cwd,
stdio: 'pipe',
})
}
const bare = `${mkdtempSync(join(tmpdir(), 'e2e-ws-'))}.git`
git(['init', '--bare', '-b', 'main', bare])
const seed = mkdtempSync(join(tmpdir(), 'e2e-seed-'))
git(['clone', bare, seed])
writeFileSync(join(seed, 'solution.py'), 'def add(a, b):\n raise NotImplementedError\n')
writeFileSync(
join(dir, 'test_solution.py'),
join(seed, 'test_solution.py'),
'from solution import add\nassert add(2, 3) == 5\nassert add(-1, 1) == 0\nassert add(0, 0) == 0\nprint("PASS")\n',
)
return dir
git(['add', '-A'], seed)
git(['commit', '-m', 'task'], seed)
git(['push', 'origin', 'main'], seed)
rmSync(seed, { recursive: true, force: true })
return bare
}

/** The deployable check: run the test in the worker's cwd. Exit 0 = delivered. No LLM judge. */
Expand Down Expand Up @@ -83,35 +100,41 @@ async function bridgeChat(opts: {

const transcripts: Array<{ who: string; said: string; delivered?: boolean }> = []

/** A WORKER = a real opencode coding session in its OWN cwd, graded by the real test. */
function makeWorker(rawProfile: unknown, templateDir: string, n: number): Agent<unknown, unknown> {
/** A WORKER = a real opencode coding session in a clone of the SHARED workspace, graded by the
* real test; its delivery is committed back so the next worker builds on it (not isolated). */
function makeWorker(rawProfile: unknown, ws: Workspace, n: number): Agent<unknown, unknown> {
const p = asAuthoredProfile(rawProfile)
const name = p?.name ?? `worker-${n}`
let artifact: ExecutorResult<unknown> | undefined
const inner: Executor<unknown> = {
runtime: 'router',
async execute() {
const cwd = mkdtempSync(join(tmpdir(), 'e2e-worker-'))
cpSync(templateDir, cwd, { recursive: true })
const sys = p?.systemPrompt ?? TASK
const said = await bridgeChat({
messages: [
{
role: 'user',
content: `${sys}\n\nYou are working in the current directory. Edit the files so that running \`python3 test_solution.py\` prints PASS. Do it now.`,
},
],
cwd,
})
const delivered = checkPasses(cwd)
transcripts.push({ who: name, said: said.slice(0, 300), delivered })
const run = await runInWorkspace(
ws,
async (cwd) => {
const said = await bridgeChat({
messages: [
{
role: 'user',
content: `${sys}\n\nYou are working in the current directory (it already holds prior workers' committed progress). Edit the files so that running \`python3 test_solution.py\` prints PASS. Do it now.`,
},
],
cwd,
})
const valid = checkPasses(cwd)
transcripts.push({ who: name, said: said.slice(0, 300), delivered: valid })
return { valid, value: said.slice(0, 120), message: `${name}: ${valid ? 'delivered' : 'wip'}` }
},
{ tmpPrefix: 'e2e-worker-', commitOnInvalid: true },
)
const delivered = run.valid
artifact = {
outRef: contentAddress(`${name}:${delivered}`),
out: { worker: name, delivered, profileSystemPrompt: sys.slice(0, 120) },
out: { worker: name, delivered, rev: run.commit?.ok ? run.commit.rev : undefined, profileSystemPrompt: sys.slice(0, 120) },
verdict: { valid: delivered, score: delivered ? 1 : 0 },
spent: { iterations: 1, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 },
}
rmSync(cwd, { recursive: true, force: true })
return artifact
},
teardown: () => Promise.resolve({ destroyed: true }),
Expand All @@ -125,8 +148,9 @@ function makeWorker(rawProfile: unknown, templateDir: string, n: number): Agent<
}

async function main(): Promise<void> {
console.log(`atom-mcp-e2e: model=${MODEL} (real boxes, real MCP, real test)`)
const templateDir = makeTaskTemplate()
console.log(`atom-mcp-e2e: model=${MODEL} (real boxes, real MCP, real test, shared workspace)`)
const bareRef = seedWorkspaceRepo()
const ws = gitWorkspace({ ref: bareRef })
const blobs = new InMemoryResultBlobStore()
let n = 0

Expand All @@ -136,7 +160,7 @@ async function main(): Promise<void> {
const mcp = await serveCoordinationMcp({
scope,
blobs,
makeWorkerAgent: (raw) => makeWorker(raw, templateDir, n++),
makeWorkerAgent: (raw) => makeWorker(raw, ws, n++),
perWorker: { maxIterations: 2, maxTokens: 200_000 },
})
// The supervisor's cwd carries the REAL skill file (opencode loads it from the cwd skill dirs).
Expand Down Expand Up @@ -178,7 +202,7 @@ async function main(): Promise<void> {
maxDepth: 4,
now: () => Date.now(),
})
rmSync(templateDir, { recursive: true, force: true })
rmSync(bareRef, { recursive: true, force: true })

console.log('\n── transcripts (real driver↔worker) ──')
for (const t of transcripts) {
Expand Down
75 changes: 75 additions & 0 deletions docs/research/interactive-sessions-spec.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Spec — interactive (tmux) harness sessions + live streaming

**Vision (one sentence):** instead of headless one-shot CLI calls, each agent in a supervised run is a **live, interactive harness session in its own tmux window** (driveable, observable, resumable), the whole agent tree is one tmux session, and it streams to a browser — composing with the recorded animated replay.

**Why now:** the whole real chain already delivers — an opencode supervisor drives opencode workers via the coordination MCP, a real deployable check gates delivery (`bench/src/atom-mcp-e2e.mts`, `972707f`). What's missing is (a) the agents run *headless* (one prompt → output), so you can't watch or interact, and (b) the harness-specific glue lives in a bench script, not the substrate. This spec turns both into a real, generalized capability.

## Placement — who owns what (obeys the AgentProfile law + the layering)

The law: *an agent IS its AgentProfile; you change behavior by authoring the profile and letting the substrate materialize it — never specialize the runtime to a harness.* That decides the split cleanly:

| Layer | Owns | Why |
|---|---|---|
| **agent-runtime** (this repo) | The **recursion + the ports**: the coordination MCP over the Scope (`serveCoordinationMcp`, done), a generic **`session` Executor** that opens/drives/observes a session via the substrate's API (NOT tmux-aware), the shared `Workspace` seam, the journal→replay. | The runtime stays harness-agnostic. It drives; it never spawns tmux or knows what opencode is. |
| **agent-dev-container** (adc) | The **materialization**: given an `AgentProfile` + cwd + mcp config, stand up the harness as an **interactive tmux window** (the TUI, not `run`), materialize the FULL profile (skills as real SKILL.md files, tools, model, mcp), capture (`pipe-pane`) + stream (`ttyd`). Exposes a **session API** (create / send / observe / status / kill). | "the container where the agents actually live" — Drew. This is the harness-specific layer; it belongs in the substrate, never the runtime. |
| **cli-bridge** | Stays the *headless* harness materializer (the test target + the fast path). Optionally grows the same session API for local runs. | Already proven; the adc is the richer/interactive home. |
| **sandbox SDK** | The `AgentProfile` manifest + box abstraction the adc is a flavor of. | Where the profile shape + `resources.skills` materialization already live. |

**The seam** = a small **session API** the adc exposes and the runtime's `session` Executor consumes:
`POST /sessions {profile, cwd, mcp} → {id, ttydUrl}` · `POST /sessions/:id/send {text}` · `GET /sessions/:id/stream` (SSE: harness output + a done/settle signal) · `GET /sessions/:id/status` · `DELETE /sessions/:id`. The runtime drives the recursion through the coordination MCP; the substrate drives the *harness* through this API.

## Where the issue goes
- **Primary issue → `tangle-network/agent-dev-container`** (the materialization + the session API + ttyd). This spec is the design ref.
- **Companion issue → `tangle-network/agent-runtime`** (the generic `session` Executor + the shared `Workspace` wiring + replay-compose). Small; mostly the executor seam.
- **Track on `ops-board`** (lane: eng, owner: claude) with measurable done-criteria = the e2e checklist below.

## End-to-end checklist (the map to "done")

### Phase 0 — preconditions (DONE)
- [x] Coordination MCP over a live Scope (`serveCoordinationMcp`, real test).
- [x] Proof a coding harness mounts + calls it (`mcp-mount-probe`).
- [x] Whole headless e2e delivers (`atom-mcp-e2e`).
- [x] Standard `skills/supervise/SKILL.md`.

### Phase 1 — substrate: AgentProfile materialization (adc + bridge) *(Drew's "materialize the entire profile")*
- [ ] Materialize `resources.skills` as real `SKILL.md` files in the harness skill dir (opencode `~/.config/opencode/skill/` + project `.opencode/skill/`; verify the exact dir per harness) — loaded natively, NOT a prompt note.
- [ ] Materialize tools, model, system prompt, mcp (mcp already works — `type:'http'`).
- [ ] One `materializeAgentProfile(profile, dir)` per harness; remove the bench script's cwd-writes.
- [ ] Exit: a profile with a skill drives behavior with zero prompt-stuffing (probe: agent uses a skill it was never told about in the prompt).

### Phase 2 — substrate: interactive tmux session + session API (adc)
- [ ] `tmux new-session`/`new-window` per run/agent; run the harness in **interactive** mode (TUI), one window per agent, named by agent id.
- [ ] Drive: send the prompt (send-keys or the harness's stdin protocol); detect completion (harness done-signal / sentinel) → emit a settle event.
- [ ] Capture: `pipe-pane` → a transcript stream (for the journal).
- [ ] The session API (create/send/stream/status/kill) over HTTP.
- [ ] Resource governance: max concurrent windows, per-session timeout, cleanup on settle/crash.
- [ ] Exit: `POST /sessions` with a profile → a live tmux window you can `tmux attach` to; `/stream` yields output + a done signal.

### Phase 3 — runtime: the generic `session` Executor (agent-runtime)
- [ ] A `session` backend on the `Executor` port: `execute` calls the substrate session API (create → send task → stream until done) and settles with the result; `deliver` → `POST /sessions/:id/send` (steer); `teardown` → `DELETE /sessions/:id` (kill). Harness-agnostic.
- [ ] Wire `makeWorkerAgent` (coordination MCP) → the `session` executor, selected by the worker's `AgentProfile.backend`.
- [ ] Exit: `spawn_worker` → a worker that runs as a live interactive session, settles on its deployable check.

### Phase 4 — shared workspace (agent-runtime) *(the e2e's open design point)*
- [ ] Supervisor + its workers share ONE `Workspace` (gitWorkspace) — workers branch/worktree and deliver back, so the supervisor (and the next worker) build on one artifact. Fixes the "files missing" confusion.
- [ ] Exit: a 2-worker run where worker-2 builds on worker-1's committed output.

### Phase 5 — streaming + viz (adc + the viewer)
- [ ] `ttyd` serves the run's tmux session over a websocket; auth (bearer); a stable URL per run.
- [ ] A viewer page: the live tmux stream (now) beside the **animated replay** (the recorded tree) + the topology — one screen, live + history.
- [ ] Exit: open the URL, watch the supervisor + worker panes work in real time; scrub the replay after.

### Phase 6 — prove it e2e (no mock)
- [ ] The whole chain on interactive sessions: supervisor (tmux) authors profiles → `spawn_worker` → worker (tmux) codes in the shared workspace → real test gates → delivered — all streamed live, all journaled, replayable.
- [ ] Retire `atom-mcp-e2e`'s harness-specific shortcuts (now: author profiles, the substrate materializes).
- [ ] Exit: a recorded run URL + the replay + green deployable check.

## Open design points (decide during Phase 2–4)
- **Interactive vs headless harness mode:** does opencode/claude-code expose a driveable interactive TUI, or do we run `run` *inside* the pane for the live-output view? (Headless-in-a-pane is the cheap first cut; true interactive is the goal.)
- **Completion detection** in a TUI (sentinel vs a harness done event).
- **Session lifecycle:** resume after a crash (the journal already supports replay/resume — extend to sessions).
- **Security:** ttyd exposure + the coordination MCP exposure (bind to localhost / use an authenticated tunnel).
- **Concurrency:** N agents = N windows; the adc's resource limits.

## Net
The runtime is essentially done for this (coordination MCP + the executor port + replay). The new work is a **substrate capability in the adc** (interactive tmux sessions + full-profile materialization + ttyd), reached through one small session API and one generic `session` executor in the runtime. Nothing here specializes the runtime to a harness.
2 changes: 1 addition & 1 deletion skills/build-with-agent-runtime/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ signature + the exact "do NOT build".
| **Gate: ship/hold from a `BenchmarkReport`** (per-task cells) | `promotionGate({ report, incumbent, candidate })` — `/runtime` | canonical-api §3.4 |
| **Run the full multi-generation flywheel + certify** | `runStrategyEvolution(config)` — `/runtime` | canonical-api §3.4 |
| **Compose the prod sandbox profile** (eval/prod parity) | `composeProductionAgentProfile(base, opts)` — `/mcp` | canonical-api §3.2 |
| **Observe a run** (cost/time waterfall, live tree, OTLP) | `createWaterfallCollector` / `createTopologyView` / `createOtelExporter` via `composeRuntimeHooks(...)` — root | canonical-api §3.5 |
| **Observe a run** (cost/time waterfall, live tree, OTLP) | `createWaterfallCollector` / `createOtelExporter` via `composeRuntimeHooks(...)` — root; `createTopologyView` / `renderTopologyTree` — `/topology` | canonical-api §3.5 |
| **State any A/B claim** | `pairedLift` (bench) over `pairedBootstrap`/`heldoutSignificance` (substrate) | canonical-api §3.5 |
| **Observe/ship with billing-boundary** | `withTangleIntelligence(agent, { project, effort })` — `/intelligence` | canonical-api §7 (now live on main — verify) |

Expand Down
5 changes: 5 additions & 0 deletions src/runtime/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@ export {
isDriverSpec,
withDriverExecutor,
} from './supervise/driver-executor'
// Supervisor-as-MCP: serve the coordination verbs as a real HTTP MCP over a live Scope, so any
// harness (claude-code / codex / opencode) BECOMES the supervisor by mounting one MCP server.
export { type CoordinationMcpHandle, serveCoordinationMcp } from './supervise/coordination-mcp'
// The ONE built-in executor entrypoint: backend-as-data (`createExecutor({backend})`).
// The per-backend factories are internal case-arms; BYO agents implement `Executor`.
export {
Expand Down Expand Up @@ -407,7 +410,9 @@ export {
gitWorkspace,
jjWorkspace,
localShell,
runInWorkspace,
type Shell,
type Workspace,
type WorkspaceCommit,
type WorkspaceRun,
} from './workspace'
37 changes: 37 additions & 0 deletions src/runtime/workspace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,43 @@ export function jjWorkspace(opts: GitWorkspaceOptions): Workspace {
}
}

/**
 * Result of one `runInWorkspace` call: the worker body's verdict and value, plus the
 * outcome of the commit back to the workspace when one was attempted.
 */
export interface WorkspaceRun<T> {
/** The `valid` flag the worker body returned, passed through unchanged. */
readonly valid: boolean
/** The `value` the worker body returned, passed through unchanged. */
readonly value: T
/** Present when a commit was attempted (valid, or `commitOnInvalid`). */
readonly commit?: WorkspaceCommit
}

/**
 * Execute a worker `body` inside a throwaway checkout of a shared `Workspace`.
 *
 * A fresh temp dir is created (prefix `opts.tmpPrefix`, default `'ws-run-'`), populated by
 * `ws.materialize`, and handed to `body` as its cwd. The body's work is committed back via
 * `ws.commit` when it reports `valid` — or regardless of validity when `opts.commitOnInvalid`
 * is set — so the next worker (or the supervisor) builds on it instead of on an isolated cwd.
 *
 * The commit message is the body's `message`, else `'worker: delivered'` / `'worker: wip'`.
 * Whatever `ws.commit` resolves to is returned as `commit` untouched (the original note says a
 * conflict is reported there rather than thrown — confirm against the `Workspace` impl).
 * The temp dir is always removed, including when materialize/body/commit rejects; durable
 * state lives only in the workspace ref.
 *
 * @param ws   The shared workspace to materialize from and commit to.
 * @param body The worker; receives the materialized dir and returns its verdict + value.
 * @param opts `tmpPrefix` for the temp dir name; `commitOnInvalid` to also commit WIP.
 * @returns The body's `valid`/`value`, plus `commit` iff a commit was attempted.
 */
export async function runInWorkspace<T>(
  ws: Workspace,
  body: (cwd: string) => Promise<{ valid: boolean; value: T; message?: string }>,
  opts: { tmpPrefix?: string; commitOnInvalid?: boolean } = {},
): Promise<WorkspaceRun<T>> {
  // Node built-ins are loaded lazily, at call time, exactly as before.
  const fs = await import('node:fs')
  const os = await import('node:os')
  const path = await import('node:path')

  const scratch = fs.mkdtempSync(path.join(os.tmpdir(), opts.tmpPrefix ?? 'ws-run-'))
  try {
    await ws.materialize(scratch)
    const { valid, value, message } = await body(scratch)

    // Nothing to persist: an invalid result without the opt-in is returned commit-less.
    if (!valid && !opts.commitOnInvalid) {
      return { valid, value }
    }

    const fallback = valid ? 'worker: delivered' : 'worker: wip'
    const commit = await ws.commit(scratch, message ?? fallback)
    return { valid, value, commit }
  } finally {
    fs.rmSync(scratch, { recursive: true, force: true })
  }
}

/** Keep only the trailing 400 UTF-16 code units of `s`; shorter strings are returned whole. */
function tail(s: string): string {
  const keep = 400
  if (s.length <= keep) return s
  return s.substring(s.length - keep)
}
Loading
Loading