Skip to content

Commit 93b10c8

Browse files
committed
feat(gastown): implement town reconciler with event-driven state management
Replace the imperative patrol/scheduling/review-queue alarm phases with a declarative reconciler that drains events, computes desired state, and applies corrective actions. This is the complete implementation of the reconciliation spec (Phases 1-5). Core changes: - Event table (town_events) with 8 event types, dual-write in all RPC handlers - 6 reconciler functions: agents, beads, review queue, convoys, GUPP, GC - applyEvent/applyAction for all event and action types - Event-only agentDone/agentCompleted (no direct mutations) - Lazy assignment for slingConvoy/startConvoy (#1249) - Container status observation pre-phase (replaces witnessPatrol) - Agent activity watermark via enriched heartbeat - Invariant checker + reconciler metrics in getAlarmStatus - Rework flow: gt_request_changes tool for refinery change requests - Refinery system prompt wired in (buildRefinerySystemPrompt) Bug fixes discovered during rollout: - Convoy landing MR cycling (reconciler created duplicates) - Multi-agent hook mutual exclusion (hookBead now unhooks stale agents) - agentCompleted no longer closes beads when gt_done wasn't called - GUPP NaN bug for agents with null last_activity_at - Container status event flood (upsert instead of insert per tick) - Rule 4 scoped to in_progress MR beads only Dead code removed: ~1,578 lines from patrol.ts, scheduling.ts, review-queue.ts, and Town.do.ts (witnessPatrol, deaconPatrol, processReviewQueue, processConvoyLandings, pollPendingPRs, etc.)
1 parent 455cddb commit 93b10c8

26 files changed

Lines changed: 3638 additions & 1962 deletions

cloudflare-gastown/container/plugin/client.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,16 @@ export class GastownClient {
116116
});
117117
}
118118

119+
async requestChanges(input: {
120+
feedback: string;
121+
files?: string[];
122+
}): Promise<{ rework_bead_id: string }> {
123+
return this.request<{ rework_bead_id: string }>(this.agentPath('/request-changes'), {
124+
method: 'POST',
125+
body: JSON.stringify(input),
126+
});
127+
}
128+
119129
async checkMail(): Promise<Mail[]> {
120130
return this.request<Mail[]>(this.agentPath('/mail'));
121131
}

cloudflare-gastown/container/plugin/tools.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,38 @@ export function createTools(client: GastownClient) {
7878
},
7979
}),
8080

81+
gt_request_changes: tool({
82+
description:
83+
'Request changes on the code you are reviewing. This creates a rework task ' +
84+
'for a polecat to address your feedback. After calling this, call gt_done to ' +
85+
'release your session. The polecat will push fixes to the same branch, and ' +
86+
'you will be re-dispatched to re-review once the rework is complete. ' +
87+
'Only available to refinery agents.',
88+
args: {
89+
feedback: tool.schema
90+
.string()
91+
.describe(
92+
'Detailed description of what needs to change. Be specific: ' +
93+
'reference file names, function names, and the exact issues found.'
94+
),
95+
files: tool.schema
96+
.array(tool.schema.string())
97+
.describe('Optional list of specific file paths that need changes')
98+
.optional(),
99+
},
100+
async execute(args) {
101+
const result = await client.requestChanges({
102+
feedback: args.feedback,
103+
files: args.files,
104+
});
105+
return (
106+
`Rework request created (bead ${result.rework_bead_id}). ` +
107+
'A polecat will be assigned to address your feedback. ' +
108+
'Call gt_done now to release your session. You will be re-dispatched to re-review once the rework is complete.'
109+
);
110+
},
111+
}),
112+
81113
gt_mail_send: tool({
82114
description:
83115
'Send a typed message to another agent in the rig. ' +

cloudflare-gastown/container/src/heartbeat.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ async function sendHeartbeats(): Promise<void> {
5353
townId: agent.townId,
5454
status: agent.status,
5555
timestamp: new Date().toISOString(),
56+
lastEventType: agent.lastEventType ?? null,
57+
lastEventAt: agent.lastEventAt ?? null,
58+
activeTools: agent.activeTools ?? [],
59+
messageCount: agent.messageCount ?? 0,
5660
};
5761

5862
try {

cloudflare-gastown/container/src/process-manager.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,8 @@ async function subscribeToEvents(
449449
if (sessionID && sessionID !== agent.sessionId) continue;
450450

451451
agent.lastActivityAt = new Date().toISOString();
452+
agent.lastEventType = event.type ?? 'unknown';
453+
agent.lastEventAt = new Date().toISOString();
452454

453455
// Track active tool calls
454456
if (event.properties && 'activeTools' in event.properties) {
@@ -534,6 +536,8 @@ export async function startAgent(
534536
workdir,
535537
startedAt: now,
536538
lastActivityAt: now,
539+
lastEventType: null,
540+
lastEventAt: null,
537541
activeTools: [],
538542
messageCount: 0,
539543
exitReason: null,

cloudflare-gastown/container/src/types.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ export type ManagedAgent = {
103103
workdir: string;
104104
startedAt: string;
105105
lastActivityAt: string;
106+
/** Event type of the most recent SDK event (e.g. 'message_part.updated') */
107+
lastEventType: string | null;
108+
/** ISO 8601 timestamp of the most recent SDK event */
109+
lastEventAt: string | null;
106110
/** Last known active tool calls (populated from SSE events) */
107111
activeTools: string[];
108112
/** Total messages sent to this agent */
@@ -299,6 +303,11 @@ export type HeartbeatPayload = {
299303
townId: string;
300304
status: AgentStatus;
301305
timestamp: string;
306+
// SDK activity watermark
307+
lastEventType: string | null;
308+
lastEventAt: string | null;
309+
activeTools: string[];
310+
messageCount: number;
302311
};
303312

304313
// ── Stream ticket (for WebSocket streaming) ─────────────────────────────

cloudflare-gastown/docs/post-deploy-monitoring.md

Lines changed: 78 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,46 @@ Guide for an AI agent to verify town health after a production deploy.
44

55
## Prerequisites
66

7-
- The debug endpoint is deployed: `GET /debug/towns/:townId/status` (unauthenticated)
7+
- The debug endpoint is deployed: `GET /debug/towns/:townId/status`
8+
- The debug endpoint is protected by Cloudflare Access. Requests must include service token headers.
89
- Base URL: `https://gastown.kiloapps.io`
910
- Town ID: obtain from `GET /trpc/gastown.listOrgTowns` (requires auth) or from the user
1011

12+
### Authentication
13+
14+
The debug endpoint requires Cloudflare Access service token headers. These are the same credentials the Next.js app uses to communicate with gastown:
15+
16+
```bash
17+
# Set these from your Cloudflare Access service token
18+
export CF_ACCESS_CLIENT_ID="<service-token-client-id>"
19+
export CF_ACCESS_CLIENT_SECRET="<service-token-client-secret>"
20+
```
21+
22+
All `curl` commands in this document use a helper function that includes these headers:
23+
24+
```bash
25+
debug_curl() {
26+
curl -s \
27+
-H "CF-Access-Client-Id: $CF_ACCESS_CLIENT_ID" \
28+
-H "CF-Access-Client-Secret: $CF_ACCESS_CLIENT_SECRET" \
29+
"$@"
30+
}
31+
```
32+
1133
## 1. Monitor Script
1234

1335
The monitoring script at `scripts/monitor-town.sh` polls the debug endpoint:
1436

1537
```bash
38+
export CF_ACCESS_CLIENT_ID="<client-id>"
39+
export CF_ACCESS_CLIENT_SECRET="<client-secret>"
1640
./scripts/monitor-town.sh <townId> [interval_seconds]
1741
```
1842

1943
Or poll manually:
2044

2145
```bash
22-
curl -s "https://gastown.kiloapps.io/debug/towns/$TOWN_ID/status" | python3 -c "
46+
debug_curl "https://gastown.kiloapps.io/debug/towns/$TOWN_ID/status" | python3 -c "
2347
import sys, json
2448
d = json.load(sys.stdin)
2549
a = d['alarmStatus']
@@ -29,6 +53,9 @@ ref = [x for x in d['agentMeta'] if x.get('role') == 'refinery']
2953
if ref:
3054
r = ref[0]
3155
print(f\"Refinery: status={r['status']} hook={r.get('current_hook_bead_id') or 'NULL'}\")
56+
recon = a.get('reconciler')
57+
if recon:
58+
print(f\"Reconciler: events={recon['eventsDrained']} actions={recon['actionsEmitted']} violations={recon['invariantViolations']} wallMs={recon['wallClockMs']}\")
3259
for e in a.get('recentEvents', [])[:5]:
3360
print(f\" {e['time'][:19]} {e['message'][:80]}\")
3461
"
@@ -44,7 +71,7 @@ The Durable Object reinitializes. Check the alarm is running:
4471

4572
```bash
4673
# Alarm should show 'active (5s)' within 10s of deploy
47-
curl -s ".../debug/towns/$TOWN_ID/status" | python3 -c "
74+
debug_curl "https://gastown.kiloapps.io/debug/towns/$TOWN_ID/status" | python3 -c "
4875
import sys, json; d = json.load(sys.stdin)
4976
print(d['alarmStatus']['alarm']['intervalLabel'])
5077
"
@@ -67,23 +94,24 @@ Poll until agents start working:
6794

6895
```bash
6996
# Should see Working > 0 within 2-3 min
70-
curl -s ".../debug/towns/$TOWN_ID/status" | python3 -c "
97+
debug_curl "https://gastown.kiloapps.io/debug/towns/$TOWN_ID/status" | python3 -c "
7198
import sys, json; d = json.load(sys.stdin)
7299
print(f\"Working: {d['alarmStatus']['agents']['working']}\")
73100
"
74101
```
75102

76103
### Phase 3: Agent Recovery (1-5min)
77104

78-
Verify agents recover from the container restart:
105+
Verify agents recover from the container restart. The reconciler handles all recovery:
79106

80-
- **Polecats**: idle+hooked agents should be dispatched by `schedulePendingWork`
81-
- **Refinery**: if it was mid-review, `recoverStuckReviews` handles it after the timeout
82-
- **Orphaned beads**: `rehookOrphanedBeads` picks up beads with stale assignees after 2 min
107+
- **Polecats**: idle+hooked agents are re-dispatched by `reconcileBeads` Rule 2
108+
- **Refinery**: if it was mid-review, the container status observation detects the dead container and sets the refinery to idle. `reconcileReviewQueue` Rule 6 re-dispatches it.
109+
- **Orphaned beads**: `reconcileBeads` Rule 3 resets in-progress beads with no working agent to open after 5 min, then Rule 1 assigns a new agent.
83110

84111
**Red flags**:
112+
85113
- `Working: 0` for more than 5 min after container is active
86-
- `orphanedHooks` growing without dispatches
114+
- `invariantViolations > 0` in reconciler metrics
87115
- `failed` count increasing rapidly (dispatch attempts burning out)
88116

89117
### Phase 4: Review Pipeline (5-15min)
@@ -95,10 +123,34 @@ in_progress → in_review → review_completed → closed
95123
```
96124

97125
Check that:
126+
98127
- The refinery picks up MR beads (status transitions to `working`)
99128
- Reviews complete as `merged` (not `Refinery container failed to start`)
100129
- Source beads reach `closed` and stay closed
101130

131+
### Phase 5: Reconciler Health
132+
133+
Verify the reconciler is running correctly:
134+
135+
```bash
136+
debug_curl "https://gastown.kiloapps.io/debug/towns/$TOWN_ID/status" | python3 -c "
137+
import sys, json; d = json.load(sys.stdin)
138+
r = d['alarmStatus'].get('reconciler')
139+
if r:
140+
print(f\"Events drained: {r['eventsDrained']}\")
141+
print(f\"Actions emitted: {r['actionsEmitted']}\")
142+
print(f\"Invariant violations: {r['invariantViolations']}\")
143+
print(f\"Wall clock: {r['wallClockMs']}ms\")
144+
print(f\"Pending events: {r['pendingEventCount']}\")
145+
if r.get('actionsByType'):
146+
print(f\"Action types: {r['actionsByType']}\")
147+
else:
148+
print('No reconciler metrics yet')
149+
"
150+
```
151+
152+
**Expected**: `invariantViolations: 0`, `wallClockMs < 100`, `pendingEventCount: 0`
153+
102154
## 3. Test Convoy
103155

104156
Create a simple test convoy to verify the full pipeline. Use the tRPC `slingConvoy` endpoint:
@@ -127,7 +179,7 @@ Then monitor until both beads reach `closed`:
127179
```bash
128180
# Poll every 30s until no non-terminal issue beads remain
129181
while true; do
130-
RESP=$(curl -s ".../debug/towns/$TOWN_ID/status")
182+
RESP=$(debug_curl "https://gastown.kiloapps.io/debug/towns/$TOWN_ID/status")
131183
ISSUES=$(echo "$RESP" | python3 -c "
132184
import sys, json
133185
beads = json.load(sys.stdin).get('beadSummary', [])
@@ -144,22 +196,24 @@ done
144196
```
145197

146198
**Expected timeline**:
147-
- 0-2 min: beads created, polecats dispatched
199+
200+
- 0-2 min: beads created, polecats dispatched by reconciler (lazy assignment)
148201
- 2-10 min: polecats work, submit reviews
149202
- 10-15 min: refinery reviews and merges
150203
- 15-25 min: second bead goes through the same cycle
151204
- 25-30 min: convoy lands (all beads closed)
152205

153206
**Failure indicators**:
154-
- Beads stuck in `open` for >5 min → check `schedulePendingWork` / agent hooks
207+
208+
- Beads stuck in `open` for >5 min → check reconciler actions (should emit `dispatch_agent`)
155209
- Beads stuck in `in_review` for >15 min → check refinery status and MR beads
156210
- MR beads stuck in `in_progress` for >5 min → check refinery dispatch retry
157-
- Beads cycling `in_progress → open` → check `dispatchAgent` failures
211+
- Beads cycling `in_progress → open` → check `agentCompleted` events and STALE_IN_PROGRESS_TIMEOUT_MS
158212
- Reviews completing as `failed` → check container start errors on refinery agent status message
159213

160214
## 4. Cleanup
161215

162-
After monitoring is complete, clean up test beads and the debug endpoint.
216+
After monitoring is complete, clean up test beads.
163217

164218
### Remove test convoy beads
165219

@@ -186,23 +240,15 @@ for b in data:
186240
done
187241
```
188242

189-
### Remove debug endpoint
190-
191-
The `/debug/towns/:townId/status` endpoint is unauthenticated and must be removed before merging to main. Remove these from the codebase:
192-
193-
1. `src/gastown.worker.ts` — the `app.get('/debug/towns/:townId/status', ...)` route
194-
2. `src/dos/Town.do.ts` — the `debugAgentMetadata()` and `debugBeadSummary()` methods
195-
3. `src/trpc/router.ts` — the `debugAgentMetadata` tRPC procedure
196-
197-
Search for `// DEBUG` comments to find all debug code to remove.
198-
199243
## 5. Key Metrics to Watch
200244

201-
| Metric | Healthy | Unhealthy |
202-
|--------|---------|-----------|
203-
| Working agents | >0 when beads exist | 0 for >5 min with open beads |
204-
| Failed bead count | Stable | Increasing rapidly |
205-
| Orphaned hooks | 0 | >0 for >5 min |
206-
| Refinery status | `working` during review, `idle` between | `idle` with in_progress MR for >5 min |
207-
| Review outcomes | `merged` | `Refinery container failed to start` |
208-
| Alarm interval | `active (5s)` with work | Stuck at same `nextFireAt` |
245+
| Metric | Healthy | Unhealthy |
246+
| --------------------- | --------------------------------------- | ------------------------------------- |
247+
| Working agents | >0 when beads exist | 0 for >5 min with open beads |
248+
| Failed bead count | Stable | Increasing rapidly |
249+
| Invariant violations | 0 | >0 (check reconciler logs) |
250+
| Refinery status | `working` during review, `idle` between | `idle` with in_progress MR for >5 min |
251+
| Review outcomes | `merged` | `Refinery container failed to start` |
252+
| Alarm interval | `active (5s)` with work | Stuck at same `nextFireAt` |
253+
| Reconciler wall clock | <100ms | >500ms consistently |
254+
| Pending event count | 0 between ticks | Growing (events not draining) |

cloudflare-gastown/scripts/monitor-town.sh

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,32 @@
11
#!/bin/bash
22
# Continuously monitor a town's state via the debug endpoint.
33
# Usage: ./scripts/monitor-town.sh [townId] [interval_seconds]
4+
#
5+
# Requires Cloudflare Access service token credentials:
6+
# export CF_ACCESS_CLIENT_ID="<service-token-client-id>"
7+
# export CF_ACCESS_CLIENT_SECRET="<service-token-client-secret>"
48

59
TOWN_ID="${1:-8a6f9375-b806-4ee0-ad6e-1697ea2dbfff}"
610
INTERVAL="${2:-15}"
711
BASE_URL="${GASTOWN_URL:-https://gastown.kiloapps.io}"
812
URL="${BASE_URL}/debug/towns/${TOWN_ID}/status"
913

14+
if [ -z "$CF_ACCESS_CLIENT_ID" ] || [ -z "$CF_ACCESS_CLIENT_SECRET" ]; then
15+
echo "Error: CF_ACCESS_CLIENT_ID and CF_ACCESS_CLIENT_SECRET must be set"
16+
echo "These are the Cloudflare Access service token credentials."
17+
exit 1
18+
fi
19+
1020
echo "Monitoring town ${TOWN_ID} every ${INTERVAL}s"
1121
echo "Endpoint: ${URL}"
1222
echo "Press Ctrl+C to stop"
1323
echo "=========================================="
1424

1525
while true; do
16-
RESP=$(curl -s --max-time 10 "${URL}" 2>/dev/null)
26+
RESP=$(curl -s --max-time 10 \
27+
-H "CF-Access-Client-Id: $CF_ACCESS_CLIENT_ID" \
28+
-H "CF-Access-Client-Secret: $CF_ACCESS_CLIENT_SECRET" \
29+
"${URL}" 2>/dev/null)
1730
if [ -z "$RESP" ]; then
1831
echo "$(date -u +%H:%M:%S) [ERROR] No response from ${URL}"
1932
sleep "$INTERVAL"
@@ -34,6 +47,7 @@ alarm = d.get('alarmStatus', {})
3447
agents_info = alarm.get('agents', {})
3548
beads_info = alarm.get('beads', {})
3649
patrol_info = alarm.get('patrol', {})
50+
recon = alarm.get('reconciler') or {}
3751
events = alarm.get('recentEvents', [])
3852
3953
working = agents_info.get('working', 0)
@@ -42,7 +56,9 @@ op = beads_info.get('open', 0)
4256
ip = beads_info.get('inProgress', 0)
4357
ir = beads_info.get('inReview', 0)
4458
failed = beads_info.get('failed', 0)
45-
orphaned = patrol_info.get('orphanedHooks', 0)
59+
violations = recon.get('invariantViolations', '-')
60+
actions = recon.get('actionsEmitted', '-')
61+
wall_ms = recon.get('wallClockMs', '-')
4662
4763
# Agent details
4864
agents = d.get('agentMeta', [])
@@ -52,7 +68,7 @@ refinery = [a for a in agents if a.get('role') == 'refinery']
5268
# Non-terminal beads
5369
beads = d.get('beadSummary', [])
5470
55-
print(f'{ts} W={working} I={idle} | open={op} prog={ip} review={ir} fail={failed} | hooks={orphaned} hooked={len(hooked_agents)}')
71+
print(f'{ts} W={working} I={idle} | open={op} prog={ip} review={ir} fail={failed} | v={violations} act={actions} ms={wall_ms}')
5672
5773
# Show refinery state
5874
for r in refinery:

0 commit comments

Comments
 (0)