tangle-network · drewstone · Jun 16, 2026 · Jun 16, 2026
diff --git a/bench/HARNESS.md b/bench/HARNESS.md
@@ -164,6 +164,17 @@ via the router, is graded by the runnable checker, and that `BenchScore` is the
 Offline plumbing test (no creds): `tsx src/gate.test.mts`. The gate runs through the SAME recursive
 atom every personified loop uses.
 
+## "Supervisor" (iterate/decompose) vs blind — through the PUBLISHED suite
+The supervisor-vs-blind gate is NOT a bespoke harness: it is `runBenchmark([sample, refine, …])`
+over an Environment. blind = `sample` (best-of-k); "supervisor" = `refine`/`sampleThenRefine`
+(depth: attempt→firewalled-analyst-steer→retry — *"a multi-agent team is just a Strategy whose driver
+spawns several agents"*). Equal compute by the substrate's CONSERVED budget; the deployable check is
+the Environment's `score`; the can't-fake-the-check firewall is built in. Run it on the HARD real
+domain via `commit0-env-run.mts` (above) or the toy `strategy-demo.mts` (offline). The LLM
+agent-driver (an LLM that itself decides spawns via the coordination MCP) is the SEPARATE product
+path — `atom-mcp-e2e.mts` / `atom-commit0.mts` — not a strategy. Evolve any strategy on a frozen
+holdout with `runStrategyEvolution`.
+
 ## Generate a fresh corpus + gate it
 The rollout generators now live with their domains: the recursive gate
 (`gate-cli.mts`) and the optimization-suite env runs (`commit0-env-run.mts`,

diff --git a/bench/src/atom-humaneval.mts b/bench/src/atom-humaneval.mts
@@ -189,7 +189,7 @@ async function driveTask(
   })
   const tree = await journal.loadTree(runId)
   const tokens = (tree ?? [])
-    .filter((e): e is Extract<(typeof tree)[number], { kind: 'settled' }> => e.kind === 'settled')
+    .filter((e): e is Extract<NonNullable<typeof tree>[number], { kind: 'settled' }> => e.kind === 'settled')
     .reduce((s, e) => s + e.spent.tokens.input + e.spent.tokens.output, 0)
   const replay = renderReplayHtml(recorder.timeline(runId), {
     title: `${task.taskId} · driver=${driverCfg.model}`,

diff --git a/bench/src/atom-mcp-e2e.mts b/bench/src/atom-mcp-e2e.mts
@@ -13,7 +13,7 @@
  */
 
 import { execFileSync } from 'node:child_process'
-import { cpSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
 import { tmpdir } from 'node:os'
 import { dirname, join } from 'node:path'
 import { fileURLToPath } from 'node:url'
@@ -26,9 +26,12 @@ import {
   createSupervisor,
   type Executor,
   type ExecutorResult,
+  gitWorkspace,
   InMemoryResultBlobStore,
   InMemorySpawnJournal,
+  runInWorkspace,
   type Scope,
+  type Workspace,
 } from '../../src/runtime/index'
 import { asAuthoredProfile } from '../../src/runtime/supervise/authoring'
 import { serveCoordinationMcp } from '../../src/runtime/supervise/coordination-mcp'
@@ -41,14 +44,28 @@ const SKILL_MD = readFileSync(join(REPO, 'skills', 'supervise', 'SKILL.md'), 'ut
 
 const TASK = 'In solution.py, implement add(a, b) so it returns the sum a + b and test_solution.py passes.'
 
-function makeTaskTemplate(): string {
-  const dir = mkdtempSync(join(tmpdir(), 'e2e-task-'))
-  writeFileSync(join(dir, 'solution.py'), 'def add(a, b):\n    raise NotImplementedError\n')
+/** Seed a bare git repo with the failing task — the SHARED workspace ref every worker clones. */
+function seedWorkspaceRepo(): string {
+  const git = (args: string[], cwd?: string): void => {
+    execFileSync('git', ['-c', 'core.hooksPath=/dev/null', '-c', 'user.email=t@t', '-c', 'user.name=t', ...args], {
+      cwd,
+      stdio: 'pipe',
+    })
+  }
+  const bare = `${mkdtempSync(join(tmpdir(), 'e2e-ws-'))}.git`
+  git(['init', '--bare', '-b', 'main', bare])
+  const seed = mkdtempSync(join(tmpdir(), 'e2e-seed-'))
+  git(['clone', bare, seed])
+  writeFileSync(join(seed, 'solution.py'), 'def add(a, b):\n    raise NotImplementedError\n')
   writeFileSync(
-    join(dir, 'test_solution.py'),
+    join(seed, 'test_solution.py'),
     'from solution import add\nassert add(2, 3) == 5\nassert add(-1, 1) == 0\nassert add(0, 0) == 0\nprint("PASS")\n',
   )
-  return dir
+  git(['add', '-A'], seed)
+  git(['commit', '-m', 'task'], seed)
+  git(['push', 'origin', 'main'], seed)
+  rmSync(seed, { recursive: true, force: true })
+  return bare
 }
 
 /** The deployable check: run the test in the worker's cwd. Exit 0 = delivered. No LLM judge. */
@@ -83,35 +100,41 @@ async function bridgeChat(opts: {
 
 const transcripts: Array<{ who: string; said: string; delivered?: boolean }> = []
 
-/** A WORKER = a real opencode coding session in its OWN cwd, graded by the real test. */
-function makeWorker(rawProfile: unknown, templateDir: string, n: number): Agent<unknown, unknown> {
+/** A WORKER = a real opencode coding session in a clone of the SHARED workspace, graded by the
+ *  real test; its work — delivered or WIP — is committed back so the next worker builds on it (not isolated). */
+function makeWorker(rawProfile: unknown, ws: Workspace, n: number): Agent<unknown, unknown> {
   const p = asAuthoredProfile(rawProfile)
   const name = p?.name ?? `worker-${n}`
   let artifact: ExecutorResult<unknown> | undefined
   const inner: Executor<unknown> = {
     runtime: 'router',
     async execute() {
-      const cwd = mkdtempSync(join(tmpdir(), 'e2e-worker-'))
-      cpSync(templateDir, cwd, { recursive: true })
       const sys = p?.systemPrompt ?? TASK
-      const said = await bridgeChat({
-        messages: [
-          {
-            role: 'user',
-            content: `${sys}\n\nYou are working in the current directory. Edit the files so that running \`python3 test_solution.py\` prints PASS. Do it now.`,
-          },
-        ],
-        cwd,
-      })
-      const delivered = checkPasses(cwd)
-      transcripts.push({ who: name, said: said.slice(0, 300), delivered })
+      const run = await runInWorkspace(
+        ws,
+        async (cwd) => {
+          const said = await bridgeChat({
+            messages: [
+              {
+                role: 'user',
+                content: `${sys}\n\nYou are working in the current directory (it already holds prior workers' committed progress). Edit the files so that running \`python3 test_solution.py\` prints PASS. Do it now.`,
+              },
+            ],
+            cwd,
+          })
+          const valid = checkPasses(cwd)
+          transcripts.push({ who: name, said: said.slice(0, 300), delivered: valid })
+          return { valid, value: said.slice(0, 120), message: `${name}: ${valid ? 'delivered' : 'wip'}` }
+        },
+        { tmpPrefix: 'e2e-worker-', commitOnInvalid: true },
+      )
+      const delivered = run.valid
       artifact = {
         outRef: contentAddress(`${name}:${delivered}`),
-        out: { worker: name, delivered, profileSystemPrompt: sys.slice(0, 120) },
+        out: { worker: name, delivered, rev: run.commit?.ok ? run.commit.rev : undefined, profileSystemPrompt: sys.slice(0, 120) },
         verdict: { valid: delivered, score: delivered ? 1 : 0 },
         spent: { iterations: 1, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 },
       }
-      rmSync(cwd, { recursive: true, force: true })
       return artifact
     },
     teardown: () => Promise.resolve({ destroyed: true }),
@@ -125,8 +148,9 @@ function makeWorker(rawProfile: unknown, templateDir: string, n: number): Agent<
 }
 
 async function main(): Promise<void> {
-  console.log(`atom-mcp-e2e: model=${MODEL}  (real boxes, real MCP, real test)`)
-  const templateDir = makeTaskTemplate()
+  console.log(`atom-mcp-e2e: model=${MODEL}  (real boxes, real MCP, real test, shared workspace)`)
+  const bareRef = seedWorkspaceRepo()
+  const ws = gitWorkspace({ ref: bareRef })
   const blobs = new InMemoryResultBlobStore()
   let n = 0
 
@@ -136,7 +160,7 @@ async function main(): Promise<void> {
       const mcp = await serveCoordinationMcp({
         scope,
         blobs,
-        makeWorkerAgent: (raw) => makeWorker(raw, templateDir, n++),
+        makeWorkerAgent: (raw) => makeWorker(raw, ws, n++),
         perWorker: { maxIterations: 2, maxTokens: 200_000 },
       })
       // The supervisor's cwd carries the REAL skill file (opencode loads it from the cwd skill dirs).
@@ -178,7 +202,7 @@ async function main(): Promise<void> {
     maxDepth: 4,
     now: () => Date.now(),
   })
-  rmSync(templateDir, { recursive: true, force: true })
+  rmSync(bareRef, { recursive: true, force: true })
 
   console.log('\n── transcripts (real driver↔worker) ──')
   for (const t of transcripts) {

diff --git a/docs/research/interactive-sessions-spec.md b/docs/research/interactive-sessions-spec.md
@@ -0,0 +1,75 @@
+# Spec — interactive (tmux) harness sessions + live streaming
+
+**Vision (one sentence):** instead of headless one-shot CLI calls, each agent in a supervised run is a **live, interactive harness session in its own tmux window** (driveable, observable, resumable), the whole agent tree is one tmux session, and it streams to a browser — composing with the recorded animated replay.
+
+**Why now:** the whole real chain already delivers — an opencode supervisor drives opencode workers via the coordination MCP, a real deployable check gates delivery (`bench/src/atom-mcp-e2e.mts`, `972707f`). What's missing is (a) the agents run *headless* (one prompt → output), so you can't watch or interact, and (b) the harness-specific glue lives in a bench script, not the substrate. This spec turns both into a real, generalized capability.
+
+## Placement — who owns what (obeys the AgentProfile law + the layering)
+
+The law: *an agent IS its AgentProfile; you change behavior by authoring the profile and letting the substrate materialize it — never specialize the runtime to a harness.* That decides the split cleanly:
+
+| Layer | Owns | Why |
+|---|---|---|
+| **agent-runtime** (this repo) | The **recursion + the ports**: the coordination MCP over the Scope (`serveCoordinationMcp`, done), a generic **`session` Executor** that opens/drives/observes a session via the substrate's API (NOT tmux-aware), the shared `Workspace` seam, the journal→replay. | The runtime stays harness-agnostic. It drives; it never spawns tmux or knows what opencode is. |
+| **agent-dev-container** (adc) | The **materialization**: given an `AgentProfile` + cwd + mcp config, stand up the harness as an **interactive tmux window** (the TUI, not `run`), materialize the FULL profile (skills as real SKILL.md files, tools, model, mcp), capture (`pipe-pane`) + stream (`ttyd`). Exposes a **session API** (create / send / observe / status / kill). | "the container where the agents actually live" — Drew. This is the harness-specific layer; it belongs in the substrate, never the runtime. |
+| **cli-bridge** | Stays the *headless* harness materializer (the test target + the fast path). Optionally grows the same session API for local runs. | Already proven; the adc is the richer/interactive home. |
+| **sandbox SDK** | The `AgentProfile` manifest + box abstraction the adc is a flavor of. | Where the profile shape + `resources.skills` materialization already live. |
+
+**The seam** = a small **session API** the adc exposes and the runtime's `session` Executor consumes:
+`POST /sessions {profile, cwd, mcp} → {id, ttydUrl}` · `POST /sessions/:id/send {text}` · `GET /sessions/:id/stream` (SSE: harness output + a done/settle signal) · `GET /sessions/:id/status` · `DELETE /sessions/:id`. The runtime drives the recursion through the coordination MCP; the substrate drives the *harness* through this API.
+
+## Where the issue goes
+- **Primary issue → `tangle-network/agent-dev-container`** (the materialization + the session API + ttyd). This spec is the design ref.
+- **Companion issue → `tangle-network/agent-runtime`** (the generic `session` Executor + the shared `Workspace` wiring + replay-compose). Small; mostly the executor seam.
+- **Track on `ops-board`** (lane: eng, owner: claude) with measurable done-criteria = the e2e checklist below.
+
+## End-to-end checklist (the map to "done")
+
+### Phase 0 — preconditions (DONE)
+- [x] Coordination MCP over a live Scope (`serveCoordinationMcp`, real test).
+- [x] Proof a coding harness mounts + calls it (`mcp-mount-probe`).
+- [x] Whole headless e2e delivers (`atom-mcp-e2e`).
+- [x] Standard `skills/supervise/SKILL.md`.
+
+### Phase 1 — substrate: AgentProfile materialization (adc + bridge)  *(Drew's "materialize the entire profile")*
+- [ ] Materialize `resources.skills` as real `SKILL.md` files in the harness skill dir (opencode `~/.config/opencode/skill/` + project `.opencode/skill/`; verify the exact dir per harness) — loaded natively, NOT a prompt note.
+- [ ] Materialize tools, model, system prompt, mcp (mcp already works — `type:'http'`).
+- [ ] One `materializeAgentProfile(profile, dir)` per harness; remove the bench script's cwd-writes.
+- [ ] Exit: a profile with a skill drives behavior with zero prompt-stuffing (probe: agent uses a skill it was never told about in the prompt).
+
+### Phase 2 — substrate: interactive tmux session + session API (adc)
+- [ ] `tmux new-session`/`new-window` per run/agent; run the harness in **interactive** mode (TUI), one window per agent, named by agent id.
+- [ ] Drive: send the prompt (send-keys or the harness's stdin protocol); detect completion (harness done-signal / sentinel) → emit a settle event.
+- [ ] Capture: `pipe-pane` → a transcript stream (for the journal).
+- [ ] The session API (create/send/stream/status/kill) over HTTP.
+- [ ] Resource governance: max concurrent windows, per-session timeout, cleanup on settle/crash.
+- [ ] Exit: `POST /sessions` with a profile → a live tmux window you can `tmux attach` to; `/stream` yields output + a done signal.
+
+### Phase 3 — runtime: the generic `session` Executor (agent-runtime)
+- [ ] A `session` backend on the `Executor` port: `execute` calls the substrate session API (create → send task → stream until done) and settles with the result; `deliver` → `POST /sessions/:id/send` (steer); `teardown` → `DELETE /sessions/:id` (kill). Harness-agnostic.
+- [ ] Wire `makeWorkerAgent` (coordination MCP) → the `session` executor, selected by the worker's `AgentProfile.backend`.
+- [ ] Exit: `spawn_worker` → a worker that runs as a live interactive session, settles on its deployable check.
+
+### Phase 4 — shared workspace (agent-runtime)  *(the e2e's open design point)*
+- [ ] Supervisor + its workers share ONE `Workspace` (gitWorkspace) — workers branch/worktree, deliver back so the supervisor (and the next worker) build on one artifact. Fixes the "files missing" confusion.
+- [ ] Exit: a 2-worker run where worker-2 builds on worker-1's committed output.
+
+### Phase 5 — streaming + viz (adc + the viewer)
+- [ ] `ttyd` serves the run's tmux session over a websocket; auth (bearer); a stable URL per run.
+- [ ] A viewer page: the live tmux stream (now) beside the **animated replay** (the recorded tree) + the topology — one screen, live + history.
+- [ ] Exit: open the URL, watch the supervisor + worker panes work in real time; scrub the replay after.
+
+### Phase 6 — prove it e2e (no mock)
+- [ ] The whole chain on interactive sessions: supervisor (tmux) authors profiles → `spawn_worker` → worker (tmux) codes in the shared workspace → real test gates → delivered — all streamed live, all journaled, replayable.
+- [ ] Retire `atom-mcp-e2e`'s harness-specific shortcuts (now: author profiles, the substrate materializes).
+- [ ] Exit: a recorded run URL + the replay + green deployable check.
+
+## Open design points (decide during Phase 2–4)
+- **Interactive vs headless harness mode:** does opencode/claude-code expose a driveable interactive TUI, or do we run `run` *inside* the pane for the live-output view? (Headless-in-a-pane is the cheap first cut; true interactive is the goal.)
+- **Completion detection** in a TUI (sentinel vs a harness done event).
+- **Session lifecycle:** resume after a crash (the journal already supports replay/resume — extend to sessions).
+- **Security:** ttyd exposure + the coordination MCP exposure (bind to localhost / an authenticated tunnel).
+- **Concurrency:** N agents = N windows; the adc's resource limits.
+
+## Net
+The runtime is essentially done for this (coordination MCP + the executor port + replay). The new work is a **substrate capability in the adc** (interactive tmux sessions + full-profile materialization + ttyd), reached through one small session API and one generic `session` executor in the runtime. Nothing here specializes the runtime to a harness.
diff --git a/skills/build-with-agent-runtime/SKILL.md b/skills/build-with-agent-runtime/SKILL.md
@@ -68,7 +68,7 @@ signature + the exact "do NOT build".
 | **Gate: ship/hold from a `BenchmarkReport`** (per-task cells) | `promotionGate({ report, incumbent, candidate })` — `/runtime` | canonical-api §3.4 |
 | **Run the full multi-generation flywheel + certify** | `runStrategyEvolution(config)` — `/runtime` | canonical-api §3.4 |
 | **Compose the prod sandbox profile** (eval/prod parity) | `composeProductionAgentProfile(base, opts)` — `/mcp` | canonical-api §3.2 |
-| **Observe a run** (cost/time waterfall, live tree, OTLP) | `createWaterfallCollector` / `createTopologyView` / `createOtelExporter` via `composeRuntimeHooks(...)` — root | canonical-api §3.5 |
+| **Observe a run** (cost/time waterfall, live tree, OTLP) | `createWaterfallCollector` / `createOtelExporter` via `composeRuntimeHooks(...)` — root; `createTopologyView` / `renderTopologyTree` — `/topology` | canonical-api §3.5 |
 | **State any A/B claim** | `pairedLift` (bench) over `pairedBootstrap`/`heldoutSignificance` (substrate) | canonical-api §3.5 |
 | **Observe/ship with billing-boundary** | `withTangleIntelligence(agent, { project, effort })` — `/intelligence` | canonical-api §7 (now live on main — verify) |
 

diff --git a/src/runtime/index.ts b/src/runtime/index.ts
@@ -311,6 +311,9 @@ export {
   isDriverSpec,
   withDriverExecutor,
 } from './supervise/driver-executor'
+// Supervisor-as-MCP: serve the coordination verbs as a real HTTP MCP over a live Scope, so any
+// harness (claude-code / codex / opencode) BECOMES the supervisor by mounting one MCP server.
+export { type CoordinationMcpHandle, serveCoordinationMcp } from './supervise/coordination-mcp'
 // The ONE built-in executor entrypoint: backend-as-data (`createExecutor({backend})`).
 // The per-backend factories are internal case-arms; BYO agents implement `Executor`.
 export {
@@ -407,7 +410,9 @@ export {
   gitWorkspace,
   jjWorkspace,
   localShell,
+  runInWorkspace,
   type Shell,
   type Workspace,
   type WorkspaceCommit,
+  type WorkspaceRun,
 } from './workspace'
diff --git a/src/runtime/workspace.ts b/src/runtime/workspace.ts
@@ -132,6 +132,43 @@ export function jjWorkspace(opts: GitWorkspaceOptions): Workspace {
   }
 }
 
+export interface WorkspaceRun<T> { // result shape returned by `runInWorkspace`
+  readonly valid: boolean // the `valid` flag reported by the worker body
+  readonly value: T // the `value` returned by the worker body, passed through unchanged
+  /** Present when a commit was attempted (valid, or `commitOnInvalid`). */
+  readonly commit?: WorkspaceCommit
+}
+
+/**
+ * Run a worker `body` inside a FRESH clone of a shared `Workspace`, then commit its work back
+ * so the next worker (or the supervisor) builds on it. This is the seam that turns isolated
+ * per-worker cwds into one compounding artifact — `body` gets a real materialized dir, and its
+ * work is committed to the shared ref when it is valid OR `opts.commitOnInvalid` is set (a
+ * conflict is returned, never thrown). The clone is always removed; durable state lives only in the ref.
+ */
+export async function runInWorkspace<T>(
+  ws: Workspace,
+  body: (cwd: string) => Promise<{ valid: boolean; value: T; message?: string }>,
+  opts: { tmpPrefix?: string; commitOnInvalid?: boolean } = {},
+): Promise<WorkspaceRun<T>> {
+  const { mkdtempSync, rmSync } = await import('node:fs')
+  const { tmpdir } = await import('node:os')
+  const { join } = await import('node:path')
+  const dir = mkdtempSync(join(tmpdir(), opts.tmpPrefix ?? 'ws-run-')) // unique temp dir; prefix defaults to 'ws-run-'
+  try {
+    await ws.materialize(dir)
+    const r = await body(dir)
+    if (r.valid || opts.commitOnInvalid) { // commit valid work always; invalid work only when the caller opts in
+      const message = r.message ?? (r.valid ? 'worker: delivered' : 'worker: wip') // caller's message wins; else a default keyed on validity
+      const commit = await ws.commit(dir, message)
+      return { valid: r.valid, value: r.value, commit }
+    }
+    return { valid: r.valid, value: r.value } // invalid and not opted in: nothing committed, so no `commit` field
+  } finally {
+    rmSync(dir, { recursive: true, force: true }) // always drop the temp dir, whether body/commit succeeded or threw
+  }
+}
+
 function tail(s: string): string {
   return s.slice(-400)
 }