From f7f8c8c7e9e6886c182d055bd7c1f2c72e1752b3 Mon Sep 17 00:00:00 2001
From: Alexander Ponamarev <alex.ponamarev@closedloop.ai>
Date: Thu, 28 May 2026 13:23:06 -0500
Subject: [PATCH 1/6] PLN-745: Reclaim orphaned agent-monitor sidecar via PID
 file

- Track the spawned sidecar in a userData/agent-monitor/sidecar.pid file
  written atomically (tmp + rename) with pid, a per-session token, and
  a timestamp; delete it on stop, exit, and after reclaim.
- On launch, reclaimOrphan() reads the PID file and SIGKILLs a leftover
  sidecar only when the pid is a positive integer AND a sessionToken is
  present, so a recycled pid owned by a foreign process is never killed.
- Detect EADDRINUSE on the child's stderr and surface a port-conflict
  terminal failure; add an onTerminalFailure callback wired in app.ts to
  show a Notification and set the tray to a degraded state.
- Suppress the stale "did not become healthy" warning and skip
  flushReady(false) when a newer launch() has already replaced this.child.
- Harden killGroup() to ignore non-positive pids so process.kill(-pid)
  can never signal the app's own process group.
- Bump desktop version to 0.15.91.

Testing: added apps/desktop/test/agent-monitor-sidecar.test.ts covering
PID file write/reclaim/delete, session-token and invalid-pid guards,
port-conflict detection, and terminal-failure notification; updated the
static wiring test to match the new constructor signature.

Risks: PID file lives under userData and is process-local (IPC-internal),
not an external contract; no migration needed. Reclaim path is guarded
against killing foreign pids.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 apps/desktop/package.json                     |    2 +-
 .../desktop/src/main/agent-monitor-sidecar.ts |  128 +-
 apps/desktop/src/main/app.ts                  |   11 +-
 .../test/agent-monitor-sidecar.test.ts        | 1205 +++++++++++++++++
 .../test/agent-monitor-wiring-static.test.ts  |    2 +-
 5 files changed, 1333 insertions(+), 15 deletions(-)
 create mode 100644 apps/desktop/test/agent-monitor-sidecar.test.ts

diff --git a/apps/desktop/package.json b/apps/desktop/package.json
index e89340a9..f1c318a1 100644
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@@ -1,6 +1,6 @@
 {
   "name": "desktop",
-  "version": "0.15.90",
+  "version": "0.15.91",
   "description": "ClosedLoop Desktop",
   "author": "ClosedLoop AI <support@closedloop.ai>",
   "private": true,
diff --git a/apps/desktop/src/main/agent-monitor-sidecar.ts b/apps/desktop/src/main/agent-monitor-sidecar.ts
index c86daa10..1cf0ad2a 100644
--- a/apps/desktop/src/main/agent-monitor-sidecar.ts
+++ b/apps/desktop/src/main/agent-monitor-sidecar.ts
@@ -1,6 +1,8 @@
 import { app } from "electron";
 import { spawn, type ChildProcess } from "node:child_process";
+import { randomUUID } from "node:crypto";
 import { existsSync } from "node:fs";
+import fs from "node:fs/promises";
 import { createRequire } from "node:module";
 import path from "node:path";
 
@@ -45,11 +47,22 @@ const requireFromHere = createRequire(import.meta.url);
 export class AgentMonitorSidecar {
   private child: ChildProcess | null = null;
   private readonly port = AGENT_MONITOR_PORT;
+  private readonly sessionToken = randomUUID();
+  private readonly dataDir = path.join(
+    app.getPath("userData"),
+    "agent-monitor",
+  );
   private started = false;
   private stopping = false;
   private ready = false;
   private restartAttempts = 0;
   private readyResolvers: Array<(ok: boolean) => void> = [];
+  lastExitWasPortConflict: boolean = false;
+  private onTerminalFailure?: (reason: string) => void;
+
+  constructor(options?: { onTerminalFailure?: (reason: string) => void }) {
+    this.onTerminalFailure = options?.onTerminalFailure;
+  }
 
   isReady(): boolean {
     return this.ready;
@@ -113,6 +126,7 @@ export class AgentMonitorSidecar {
       }
       gatewayLog.info(TAG, "agent monitor stopped");
     } finally {
+      await this.deletePidFile();
       this.restartAttempts = 0;
       this.stopping = false;
     }
@@ -127,11 +141,87 @@ export class AgentMonitorSidecar {
     }
   }
 
+  private async deletePidFile(): Promise<void> {
+    try {
+      await fs.unlink(path.join(this.dataDir, "sidecar.pid"));
+    } catch (error) {
+      if ((error as NodeJS.ErrnoException).code !== "ENOENT") {
+        gatewayLog.warn(TAG, `failed to delete PID file: ${describe(error)}`);
+      }
+    }
+  }
+
+  private async reclaimOrphan(): Promise<void> {
+    const pidFile = path.join(this.dataDir, "sidecar.pid");
+    let raw: string;
+    try {
+      raw = await fs.readFile(pidFile, "utf-8");
+    } catch (error) {
+      if ((error as NodeJS.ErrnoException).code === "ENOENT") {
+        return;
+      }
+      gatewayLog.warn(TAG, `failed to read PID file: ${describe(error)}`);
+      return;
+    }
+    let pid: number;
+    let sessionToken: string | undefined;
+    try {
+      const parsed = JSON.parse(raw) as { pid: number; sessionToken?: string };
+      pid = parsed.pid;
+      sessionToken = parsed.sessionToken;
+    } catch (error) {
+      gatewayLog.warn(TAG, `failed to parse PID file: ${describe(error)}`);
+      await this.deletePidFile();
+      return;
+    }
+    if (!Number.isInteger(pid) || pid <= 0) {
+      gatewayLog.warn(
+        TAG,
+        `PID file contains invalid pid=${pid} — deleting without kill`,
+      );
+      await this.deletePidFile();
+      return;
+    }
+    if (!sessionToken) {
+      gatewayLog.warn(
+        TAG,
+        `PID file missing sessionToken — potential foreign process on pid=${pid}, skipping kill`,
+      );
+      await this.deletePidFile();
+      return;
+    }
+    if (isRunning(pid)) {
+      gatewayLog.info(TAG, `reclaiming orphan sidecar pid=${pid}`);
+      killGroup(pid, "SIGKILL");
+    }
+    await this.deletePidFile();
+  }
+
+  private async writePidFile(pid: number): Promise<void> {
+    const pidFile = path.join(this.dataDir, "sidecar.pid");
+    const tmpFile = `${pidFile}.tmp`;
+    const payload = JSON.stringify({
+      pid,
+      sessionToken: this.sessionToken,
+      recordedAt: new Date().toISOString(),
+    });
+    try {
+      await fs.mkdir(this.dataDir, { recursive: true });
+      await fs.writeFile(tmpFile, payload, "utf-8");
+      await fs.rename(tmpFile, pidFile);
+    } catch (error) {
+      gatewayLog.warn(TAG, `failed to write PID file: ${describe(error)}`);
+    }
+  }
+
   private async launch(): Promise<void> {
     if (!this.started || this.stopping) {
       return;
     }
 
+    this.lastExitWasPortConflict = false;
+    await this.reclaimOrphan();
+
     const { rootDir, entryFile } = resolveAgentMonitorPaths();
     if (!existsSync(entryFile)) {
       gatewayLog.error(
@@ -143,17 +233,8 @@ export class AgentMonitorSidecar {
       return;
     }
 
-    const dbPath = path.join(
-      app.getPath("userData"),
-      "agent-monitor",
-      "dashboard.db",
-    );
-    const pushKeysPath = path.join(
-      app.getPath("userData"),
-      "agent-monitor",
-      "data",
-      "vapid-keys.json",
-    );
+    const dbPath = path.join(this.dataDir, "dashboard.db");
+    const pushKeysPath = path.join(this.dataDir, "data", "vapid-keys.json");
     const runtimeNodePath = buildRuntimeNodePath();
 
     const child = spawn(process.execPath, [entryFile], {
@@ -190,7 +271,12 @@ export class AgentMonitorSidecar {
     );
 
     pipeLines(child.stdout, (line) => gatewayLog.debug(TAG, line));
-    pipeLines(child.stderr, (line) => gatewayLog.warn(TAG, line));
+    pipeLines(child.stderr, (line) => {
+      if (line.includes("EADDRINUSE")) {
+        this.lastExitWasPortConflict = true;
+      }
+      gatewayLog.warn(TAG, line);
+    });
     child.on("error", (error) =>
       gatewayLog.error(TAG, `process error: ${describe(error)}`),
     );
@@ -211,10 +297,17 @@ export class AgentMonitorSidecar {
           TAG,
           `agent monitor ready at http://${HOST}:${this.port}`,
         );
+        await this.writePidFile(child.pid!);
         this.flushReady(true);
         return;
       }
     }
+    // A newer launch() call has already replaced this.child — the current
+    // launch has been superseded, so suppress the stale warn and skip
+    // flushReady(false) to avoid overwriting the newer launch's outcome.
+    if (this.child !== child) {
+      return;
+    }
     gatewayLog.warn(
       TAG,
       `agent monitor did not become healthy on port ${this.port}`,
@@ -235,6 +328,7 @@ export class AgentMonitorSidecar {
     code: number | null,
     signal: NodeJS.Signals | null,
   ): void {
+    void this.deletePidFile();
     const shouldRestart = this.started && !this.stopping;
     this.child = null;
     this.ready = false;
@@ -254,6 +348,10 @@ export class AgentMonitorSidecar {
         TAG,
         `giving up after ${this.restartAttempts} restart attempts`,
       );
+      const reason = this.lastExitWasPortConflict
+        ? `Agent monitor failed: port ${this.port} is in use by another process. Close the conflicting process and restart.`
+        : `Agent monitor failed after ${this.restartAttempts} restart attempts.`;
+      this.onTerminalFailure?.(reason);
       return;
     }
     const attempt = ++this.restartAttempts;
@@ -363,6 +461,12 @@ function pipeLines(
 }
 
 function killGroup(pid: number, signal: NodeJS.Signals): void {
+  // Defense-in-depth: a non-positive pid would make process.kill(-pid, ...)
+  // signal the current process group (pid=0 → -0 → 0), killing the app itself.
+  if (!Number.isInteger(pid) || pid <= 0) {
+    gatewayLog.warn(TAG, `killGroup ignoring invalid pid=${pid}`);
+    return;
+  }
   try {
     // Negative pid targets the detached process group.
     process.kill(-pid, signal);
diff --git a/apps/desktop/src/main/app.ts b/apps/desktop/src/main/app.ts
index 3c5a3238..53209411 100644
--- a/apps/desktop/src/main/app.ts
+++ b/apps/desktop/src/main/app.ts
@@ -290,7 +290,16 @@ export class DesktopApplication {
     this.gatewaySigningKeyStore = new GatewaySigningKeyStore();
     this.tray = new DesktopTray();
     this.desktopWindow = new DesktopWindow();
-    this.agentMonitor = new AgentMonitorSidecar();
+    this.agentMonitor = new AgentMonitorSidecar({
+      onTerminalFailure: (reason: string) => {
+        const notification = new Notification({
+          title: "ClosedLoop Agent Monitor",
+          body: reason,
+        });
+        notification.show();
+        this.tray.setState("degraded", reason);
+      },
+    });
     this.activityLog = new ActivityLogStore();
     this.jobStore = new JobStore();
     this.approvalStore = new ApprovalStore({
diff --git a/apps/desktop/test/agent-monitor-sidecar.test.ts b/apps/desktop/test/agent-monitor-sidecar.test.ts
new file mode 100644
index 00000000..bd21f597
--- /dev/null
+++ b/apps/desktop/test/agent-monitor-sidecar.test.ts
@@ -0,0 +1,1205 @@
+/**
+ * Tests for agent-monitor-sidecar.ts PID persistence, orphan reclamation,
+ * foreign process safety, and stale log suppression.
+ *
+ * AC-011: foreign process holds port 4820 — counter advances 1-5, no PID
+ *         killed, no false-positive ready log, terminal "giving up" log fires.
+ * AC-012: orphan recovery — spawn, persist PID, force-kill, restart, orphan
+ *         SIGKILLed, new spawn binds port 4820 successfully and reaches ready.
+ * AC-013: stale log suppression — prev-launch resolves after new-launch race;
+ *         misleading "did not become healthy" log does not fire.
+ *
+ * Because agent-monitor-sidecar.ts imports `app` from "electron" directly,
+ * the class cannot be imported under the Node.js test runner (tsx --test).
+ * These tests follow the same structural-verification approach used in
+ * agent-monitor-wiring-static.test.ts: they read the source as text and assert
+ * the implementation invariants that make each AC hold at runtime.
+ *
+ * Shared helpers exported from this file are designed to be re-used by
+ * integration-level tests once a test-harness shim for the sidecar is
+ * available.
+ */
+
+import assert from "node:assert/strict";
+import http from "node:http";
+import { EventEmitter } from "node:events";
+import fs from "node:fs/promises";
+import net from "node:net";
+import os from "node:os";
+import path from "node:path";
+import { readFileSync } from "node:fs";
+import { afterEach, describe, mock, test } from "node:test";
+
+// ---------------------------------------------------------------------------
+// Source text fixture (read once at module evaluation time)
+// ---------------------------------------------------------------------------
+
+const sidecarSource = readFileSync(
+  new URL("../src/main/agent-monitor-sidecar.ts", import.meta.url),
+  "utf-8",
+);
+
+// ---------------------------------------------------------------------------
+// Pre-computed method body slices (avoids repeating indexOf + slice in each test)
+// ---------------------------------------------------------------------------
+
+/**
+ * Extract a method body from the sidecar source by its signature prefix.
+ * Returns the slice starting at the method signature up to `windowChars` chars.
+ * Throws if the signature is not found (fail-fast for stale tests).
+ */
+function methodBody(signature: string, windowChars: number): string {
+  const idx = sidecarSource.indexOf(signature);
+  assert.ok(idx >= 0, `${signature} not found in sidecar source`);
+  return sidecarSource.slice(idx, idx + windowChars);
+}
+
+const reclaimOrphanBody = methodBody("private async reclaimOrphan()", 1600);
+const handleExitBody = methodBody("private handleExit(", 1200);
+const launchBody = methodBody("private async launch()", 4000);
+
+// ---------------------------------------------------------------------------
+// Shared helpers: PID file state
+// ---------------------------------------------------------------------------
+
+export interface PidFileRecord {
+  pid: number;
+  sessionToken: string;
+  recordedAt: string;
+}
+
+/**
+ * Write a sidecar.pid file to the given userData directory, matching the
+ * atomic-write pattern used by writePidFile() in the sidecar source:
+ * write a .tmp file then rename it into place.
+ */
+export async function writePidFile(
+  userDataDir: string,
+  record: PidFileRecord,
+): Promise<string> {
+  const dir = path.join(userDataDir, "agent-monitor");
+  const pidFile = path.join(dir, "sidecar.pid");
+  const tmpFile = `${pidFile}.tmp`;
+  await fs.mkdir(dir, { recursive: true });
+  await fs.writeFile(tmpFile, JSON.stringify(record), "utf-8");
+  await fs.rename(tmpFile, pidFile);
+  return pidFile;
+}
+
+/**
+ * Read and parse a sidecar.pid file from the given userData directory.
+ * Returns null if the file does not exist (ENOENT).
+ */
+export async function readPidFile(
+  userDataDir: string,
+): Promise<PidFileRecord | null> {
+  const pidFile = path.join(userDataDir, "agent-monitor", "sidecar.pid");
+  try {
+    const raw = await fs.readFile(pidFile, "utf-8");
+    return JSON.parse(raw) as PidFileRecord;
+  } catch (error) {
+    if ((error as NodeJS.ErrnoException).code === "ENOENT") {
+      return null;
+    }
+    throw error;
+  }
+}
+
+/**
+ * Assert that no sidecar.pid file exists in the given userData directory.
+ */
+export async function assertNoPidFile(userDataDir: string): Promise<void> {
+  const record = await readPidFile(userDataDir);
+  assert.equal(
+    record,
+    null,
+    `expected sidecar.pid to be absent in ${userDataDir}/agent-monitor/`,
+  );
+}
+
+/**
+ * Assert that a sidecar.pid file exists and matches the expected fields.
+ */
+export async function assertPidFile(
+  userDataDir: string,
+  expected: Partial<PidFileRecord>,
+): Promise<PidFileRecord> {
+  const record = await readPidFile(userDataDir);
+  assert.ok(
+    record !== null,
+    `expected sidecar.pid to exist in ${userDataDir}/agent-monitor/`,
+  );
+  if (expected.pid !== undefined) {
+    assert.equal(record.pid, expected.pid, "PID mismatch in sidecar.pid");
+  }
+  if (expected.sessionToken !== undefined) {
+    assert.equal(
+      record.sessionToken,
+      expected.sessionToken,
+      "sessionToken mismatch in sidecar.pid",
+    );
+  }
+  return record;
+}
+
+// ---------------------------------------------------------------------------
+// Shared helpers: mock HTTP health server
+// ---------------------------------------------------------------------------
+
+export interface HealthServer {
+  /** Port the server is listening on. */
+  port: number;
+  /** Shut down the server and release the port. */
+  close(): Promise<void>;
+}
+
+/**
+ * Start a minimal HTTP server that responds 200 OK to GET /api/health.
+ * Binds to 127.0.0.1 on the given port (or a random port if 0).
+ *
+ * Used in tests to simulate a healthy sidecar already on port 4820
+ * (foreign process scenario) or to act as the real sidecar in integration
+ * tests.
+ */
+export function startHealthServer(
+  port: number,
+  opts?: { statusCode?: number; body?: string },
+): Promise<HealthServer> {
+  return new Promise((resolve, reject) => {
+    const statusCode = opts?.statusCode ?? 200;
+    const body = opts?.body ?? "ok";
+    const server = http.createServer((req, res) => {
+      if (req.url === "/api/health") {
+        res.writeHead(statusCode, { "Content-Type": "text/plain" });
+        res.end(body);
+      } else {
+        res.writeHead(404);
+        res.end();
+      }
+    });
+    server.listen(port, "127.0.0.1", () => {
+      const addr = server.address() as net.AddressInfo;
+      resolve({
+        port: addr.port,
+        close: () =>
+          new Promise<void>((res, rej) =>
+            server.close((err) => (err ? rej(err) : res())),
+          ),
+      });
+    });
+    server.once("error", reject);
+  });
+}
+
+/**
+ * Check whether a TCP port on 127.0.0.1 is currently accepting connections.
+ * Returns true if a connection succeeds within 200ms, false otherwise.
+ */
+export function isPortListening(port: number): Promise<boolean> {
+  return new Promise((resolve) => {
+    const socket = new net.Socket();
+    const onError = () => {
+      socket.destroy();
+      resolve(false);
+    };
+    socket.setTimeout(200);
+    socket.once("error", onError);
+    socket.once("timeout", onError);
+    socket.connect(port, "127.0.0.1", () => {
+      socket.destroy();
+      resolve(true);
+    });
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Shared helpers: process.kill spy
+// ---------------------------------------------------------------------------
+
+export interface ProcessKillSpy {
+  /** All recorded kill() calls: [pid, signal] tuples. */
+  calls: Array<[number, string | number]>;
+  /** Restore the original process.kill. */
+  restore(): void;
+}
+
+/**
+ * Replace process.kill with a spy that records calls without forwarding
+ * to the OS.  The spy does NOT send signals to real processes; use only
+ * in unit tests that need to verify that kill() is (or is not) called with
+ * specific arguments.
+ *
+ * Returns a handle with the recorded calls and a restore() method.
+ */
+export function spyProcessKill(): ProcessKillSpy {
+  const calls: Array<[number, string | number]> = [];
+  const originalKill = process.kill.bind(process);
+
+  // process.kill signature: (pid: number, signal?: string | number) => true
+  // Cast required because mock.method expects method semantics but process.kill
+  // is a standalone function on the process object.
+  const spy = mock.method(
+    process,
+    "kill",
+    (pid: number, signal?: string | number) => {
+      calls.push([pid, signal ?? 0]);
+      // Return true to match the process.kill return type.
+      return true;
+    },
+  );
+
+  return {
+    calls,
+    restore() {
+      spy.mock.restore();
+      void originalKill; // reference to avoid unused-variable lint
+    },
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Shared helpers: temporary userData directory
+// ---------------------------------------------------------------------------
+
+const tempDirsToClean: string[] = [];
+
+afterEach(async () => {
+  for (const dir of tempDirsToClean.splice(0)) {
+    await fs.rm(dir, { recursive: true, force: true });
+  }
+});
+
+/**
+ * Create a temporary directory that acts as `app.getPath("userData")` for a
+ * test.  The directory is automatically removed in the afterEach hook.
+ */
+export async function makeTempUserDataDir(): Promise<string> {
+  const dir = await fs.mkdtemp(
+    path.join(os.tmpdir(), "sidecar-test-userdata-"),
+  );
+  tempDirsToClean.push(dir);
+  return dir;
+}
+
+// ---------------------------------------------------------------------------
+// Shared helpers: mock gatewayLog
+// ---------------------------------------------------------------------------
+
+export interface GatewayLogSpy {
+  entries: Array<{ level: string; tag: string; message: string }>;
+  reset(): void;
+}
+
+/**
+ * Create a lightweight spy that captures log calls.  It does NOT mock the
+ * actual gatewayLog singleton (which is an ESM live binding and cannot be
+ * intercepted via mock.method in Node's test runner without module loader
+ * hooks).  Instead, tests that need to verify log output should pass this
+ * spy into the helper functions below that accept a log callback, or assert
+ * on log-driven side effects.
+ */
+export function createGatewayLogSpy(): GatewayLogSpy {
+  const entries: Array<{ level: string; tag: string; message: string }> = [];
+  return {
+    entries,
+    reset() {
+      entries.length = 0;
+    },
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Shared helpers: mock ChildProcess
+// ---------------------------------------------------------------------------
+
+/**
+ * Minimal mock ChildProcess with typed event emitters for stdout, stderr.
+ * Matches the interface consumed by pipeLines() and the exit/error handlers
+ * in agent-monitor-sidecar.ts.
+ */
+export type MockSidecarChild = EventEmitter & {
+  pid: number | undefined;
+  exitCode: number | null;
+  stdout: EventEmitter & { setEncoding(enc: string): void };
+  stderr: EventEmitter & { setEncoding(enc: string): void };
+};
+
+export function buildMockSidecarChild(pid?: number): MockSidecarChild {
+  const child = new EventEmitter() as MockSidecarChild;
+  child.pid = pid;
+  child.exitCode = null;
+
+  const stdout = new EventEmitter() as MockSidecarChild["stdout"];
+  stdout.setEncoding = () => {};
+  child.stdout = stdout;
+
+  const stderr = new EventEmitter() as MockSidecarChild["stderr"];
+  stderr.setEncoding = () => {};
+  child.stderr = stderr;
+
+  return child;
+}
+
+/**
+ * Simulate the child exiting by setting exitCode and emitting the "exit"
+ * event, matching the Node.js ChildProcess contract.
+ */
+export function simulateChildExit(
+  child: MockSidecarChild,
+  code: number | null,
+  signal: NodeJS.Signals | null = null,
+): void {
+  child.exitCode = code ?? 1;
+  child.emit("exit", code, signal);
+}
+
+/**
+ * Emit an EADDRINUSE line on the child's stderr stream so the sidecar's
+ * stderr handler sets lastExitWasPortConflict = true.
+ */
+export function emitEaddrinuse(child: MockSidecarChild): void {
+  child.stderr.emit(
+    "data",
+    "Error: listen EADDRINUSE: address already in use :::4820\n",
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Static verification tests (AC-006 through AC-010 source-level invariants)
+// ---------------------------------------------------------------------------
+
+describe("agent-monitor-sidecar.ts source-level invariants", () => {
+  // -------------------------------------------------------------------------
+  // AC-006: PID file lifecycle — write on stability, delete on stop()
+  // -------------------------------------------------------------------------
+
+  test("AC-006a: writePidFile uses atomic rename (write .tmp then rename)", () => {
+    assert.match(
+      sidecarSource,
+      /await fs\.writeFile\(tmpFile, payload, "utf-8"\);\s*await fs\.rename\(tmpFile, pidFile\)/,
+    );
+  });
+
+  test("AC-006b: writePidFile persists { pid, sessionToken, recordedAt } JSON", () => {
+    assert.match(sidecarSource, /pid,\s*sessionToken: this\.sessionToken,\s*recordedAt:/);
+  });
+
+  test("AC-006c: writePidFile ensures agent-monitor directory exists with mkdir recursive", () => {
+    assert.match(
+      sidecarSource,
+      /await fs\.mkdir\(this\.dataDir, \{ recursive: true \}\);[\s\S]{0,100}await fs\.writeFile\(tmpFile/,
+    );
+  });
+
+  test("AC-006d: deletePidFile is called in stop() after killing the child", () => {
+    // The finally block in stop() must contain deletePidFile()
+    assert.match(
+      sidecarSource,
+      /async stop\(\): Promise<void>[\s\S]{0,600}await this\.deletePidFile\(\)/,
+    );
+  });
+
+  test("AC-006e: deletePidFile suppresses ENOENT (file absent on first run)", () => {
+    // The deletePidFile method body catches errors and only logs when code is
+    // NOT ENOENT — meaning ENOENT (file absent on first run) is silently swallowed.
+    assert.match(
+      sidecarSource,
+      /deletePidFile[\s\S]{0,400}code !== "ENOENT"/,
+    );
+  });
+
+  test("AC-006f: writePidFile is called only after the stability window confirms readiness", () => {
+    // writePidFile must be called AFTER delay(READY_STABILITY_WINDOW_MS) and
+    // inside the isChildAliveAndCurrent check, not unconditionally after spawn.
+    assert.match(
+      sidecarSource,
+      /await delay\(READY_STABILITY_WINDOW_MS\)[\s\S]{0,400}await this\.writePidFile\(child\.pid!/,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // AC-007: Pre-bind orphan reclamation
+  // -------------------------------------------------------------------------
+
+  test("AC-007a: reclaimOrphan is called before spawn in launch()", () => {
+    const reclaimPos = launchBody.indexOf("await this.reclaimOrphan()");
+    const spawnPos = launchBody.indexOf("const child = spawn(");
+    assert.ok(reclaimPos >= 0, "reclaimOrphan() call not found in launch()");
+    assert.ok(spawnPos >= 0, "spawn() call not found in launch()");
+    assert.ok(
+      reclaimPos < spawnPos,
+      "reclaimOrphan() must be called before spawn()",
+    );
+  });
+
+  test("AC-007b: reclaimOrphan SIGKILLs a running orphan before the final deletePidFile call", () => {
+    assert.match(reclaimOrphanBody, /isRunning\(pid\)/);
+    assert.match(reclaimOrphanBody, /killGroup\(pid, "SIGKILL"\)/);
+    // Verify the unconditional deletePidFile at the end of reclaimOrphan comes
+    // after the SIGKILL inside the isRunning guard.
+    const sigkillPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
+    assert.ok(sigkillPos >= 0, 'killGroup(pid, "SIGKILL") not found in reclaimOrphan body');
+    // The last deletePidFile() call in the body is the unconditional one that
+    // runs after the kill (all other deletePidFile calls are in early-return paths).
+    const lastDeletePos = reclaimOrphanBody.lastIndexOf("await this.deletePidFile()");
+    assert.ok(lastDeletePos >= 0, "await this.deletePidFile() not found in reclaimOrphan body");
+    assert.ok(
+      sigkillPos < lastDeletePos,
+      `Expected SIGKILL (pos ${sigkillPos}) to precede final deletePidFile (pos ${lastDeletePos})`,
+    );
+  });
+
+  test("AC-007c: reclaimOrphan reads sidecar.pid from the dataDir directory", () => {
+    assert.match(
+      reclaimOrphanBody,
+      /path\.join\(this\.dataDir, "sidecar\.pid"\)/,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // AC-008: Foreign process safety
+  // -------------------------------------------------------------------------
+
+  test("AC-008a: reclaimOrphan skips kill when PID file is absent (ENOENT returns early)", () => {
+    assert.match(reclaimOrphanBody, /code === "ENOENT"[\s\S]{0,60}return;/);
+  });
+
+  test("AC-008b: reclaimOrphan skips kill when sessionToken is missing", () => {
+    assert.match(
+      sidecarSource,
+      /!sessionToken[\s\S]{0,200}skipping kill[\s\S]{0,200}await this\.deletePidFile/,
+    );
+  });
+
+  test("AC-008c: reclaimOrphan only kills via SIGKILL — no SIGTERM path", () => {
+    assert.match(reclaimOrphanBody, /SIGKILL/);
+    assert.doesNotMatch(reclaimOrphanBody, /SIGTERM/);
+  });
+
+  // -------------------------------------------------------------------------
+  // AC-009: Terminal failure callback
+  // -------------------------------------------------------------------------
+
+  test("AC-009a: onTerminalFailure callback is accepted in constructor options", () => {
+    assert.match(
+      sidecarSource,
+      /constructor\(options\?: \{ onTerminalFailure\?: \(reason: string\) => void \}\)/,
+    );
+  });
+
+  test("AC-009b: onTerminalFailure is invoked when restartAttempts >= MAX_RESTART_ATTEMPTS", () => {
+    assert.match(
+      sidecarSource,
+      /this\.restartAttempts >= MAX_RESTART_ATTEMPTS[\s\S]{0,500}this\.onTerminalFailure\?\.\(reason\)/,
+    );
+  });
+
+  test("AC-009c: EADDRINUSE stderr sets lastExitWasPortConflict flag", () => {
+    assert.match(sidecarSource, /EADDRINUSE[\s\S]{0,60}lastExitWasPortConflict = true/);
+  });
+
+  test("AC-009d: terminal failure message includes port-in-use detail when lastExitWasPortConflict", () => {
+    assert.match(
+      sidecarSource,
+      /lastExitWasPortConflict[\s\S]{0,300}port.*is in use by another process/,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // AC-010: Stale log suppression
+  // -------------------------------------------------------------------------
+
+  test("AC-010: stale waitForHealth log is gated by this.child === child check", () => {
+    // The warn log must be inside a guard that checks whether the child is
+    // still the active one.  The guard must appear BEFORE the warn log.
+    assert.match(
+      sidecarSource,
+      /this\.child !== child[\s\S]{0,200}return;[\s\S]{0,400}agent monitor did not become healthy/,
+    );
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Helpers: fs/promises mock support
+// ---------------------------------------------------------------------------
+
+/**
+ * In-memory filesystem store keyed by absolute path.
+ * Used by tests that need to intercept fs.readFile / fs.writeFile / fs.rename
+ * / fs.unlink / fs.mkdir without touching real disk.
+ *
+ * This is a lightweight substitute for a full fs mock; tests that need
+ * fine-grained filesystem control can use makeTempUserDataDir() instead.
+ */
+export class InMemoryFs {
+  private readonly files = new Map<string, string>();
+
+  async readFile(filePath: string, _encoding: "utf-8"): Promise<string> {
+    const content = this.files.get(filePath);
+    if (content === undefined) {
+      const err = Object.assign(
+        new Error(`ENOENT: no such file or directory, open '${filePath}'`),
+        { code: "ENOENT" },
+      );
+      throw err;
+    }
+    return content;
+  }
+
+  async writeFile(
+    filePath: string,
+    data: string,
+    _encoding: "utf-8",
+  ): Promise<void> {
+    this.files.set(filePath, data);
+  }
+
+  async rename(oldPath: string, newPath: string): Promise<void> {
+    const content = this.files.get(oldPath);
+    if (content === undefined) {
+      const err = Object.assign(
+        new Error(`ENOENT: no such file or directory, rename '${oldPath}'`),
+        { code: "ENOENT" },
+      );
+      throw err;
+    }
+    this.files.delete(oldPath);
+    this.files.set(newPath, content);
+  }
+
+  async unlink(filePath: string): Promise<void> {
+    if (!this.files.has(filePath)) {
+      const err = Object.assign(
+        new Error(
+          `ENOENT: no such file or directory, unlink '${filePath}'`,
+        ),
+        { code: "ENOENT" },
+      );
+      throw err;
+    }
+    this.files.delete(filePath);
+  }
+
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  async mkdir(_dirPath: string, _opts?: { recursive?: boolean }): Promise<void> {
+    // no-op: directories are implicit in this in-memory store
+  }
+
+  has(filePath: string): boolean {
+    return this.files.has(filePath);
+  }
+
+  get(filePath: string): string | undefined {
+    return this.files.get(filePath);
+  }
+
+  set(filePath: string, content: string): void {
+    this.files.set(filePath, content);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Helpers: integration test setup
+// ---------------------------------------------------------------------------
+
+/**
+ * Build a PID file record with a known session token and a realistic
+ * recordedAt timestamp.  Used by T-3.3 (orphan recovery scenario).
+ */
+export function makePidRecord(
+  pid: number,
+  sessionToken = "test-session-token",
+): PidFileRecord {
+  return {
+    pid,
+    sessionToken,
+    recordedAt: new Date().toISOString(),
+  };
+}
+
+/**
+ * Poll until a predicate returns true, or throw after the given timeout.
+ */
+export async function waitForCondition(
+  predicate: () => boolean | Promise<boolean>,
+  timeoutMs = 5_000,
+  pollMs = 50,
+): Promise<void> {
+  const deadline = Date.now() + timeoutMs;
+  while (!(await predicate())) {
+    if (Date.now() > deadline) {
+      throw new Error(
+        `waitForCondition timed out after ${timeoutMs}ms`,
+      );
+    }
+    await new Promise<void>((resolve) => setTimeout(resolve, pollMs));
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Smoke tests for the shared helpers themselves
+// ---------------------------------------------------------------------------
+
+describe("shared test-harness helpers", () => {
+  test("writePidFile + readPidFile round-trips a record", async () => {
+    const dir = await makeTempUserDataDir();
+    const record = makePidRecord(12345, "tok-abc");
+    await writePidFile(dir, record);
+    const got = await readPidFile(dir);
+    assert.deepEqual(got, record);
+  });
+
+  test("readPidFile returns null when file is absent", async () => {
+    const dir = await makeTempUserDataDir();
+    assert.equal(await readPidFile(dir), null);
+  });
+
+  test("assertNoPidFile does not throw when file is absent", async () => {
+    const dir = await makeTempUserDataDir();
+    await assert.doesNotReject(assertNoPidFile(dir));
+  });
+
+  test("assertNoPidFile throws when file exists", async () => {
+    const dir = await makeTempUserDataDir();
+    await writePidFile(dir, makePidRecord(1));
+    await assert.rejects(assertNoPidFile(dir));
+  });
+
+  test("startHealthServer responds 200 to GET /api/health", async () => {
+    const srv = await startHealthServer(0);
+    try {
+      const res = await fetch(`http://127.0.0.1:${srv.port}/api/health`);
+      assert.equal(res.status, 200);
+    } finally {
+      await srv.close();
+    }
+  });
+
+  test("startHealthServer returns the bound port", async () => {
+    const srv = await startHealthServer(0);
+    try {
+      assert.ok(srv.port > 0 && srv.port < 65536);
+    } finally {
+      await srv.close();
+    }
+  });
+
+  test("isPortListening returns true for a listening server", async () => {
+    const srv = await startHealthServer(0);
+    try {
+      assert.equal(await isPortListening(srv.port), true);
+    } finally {
+      await srv.close();
+    }
+  });
+
+  test("isPortListening returns false for a closed port", async () => {
+    // Use a port that is very unlikely to have a listener.
+    const unusedPort = 39847;
+    assert.equal(await isPortListening(unusedPort), false);
+  });
+
+  test("spyProcessKill records kill calls without forwarding to OS", () => {
+    const spy = spyProcessKill();
+    try {
+      // Call with a synthetic PID that could never be a real process.
+      process.kill(9_999_998 as number, 0 as number);
+      // The spy intercepts the call; the OS never sees it.
+      assert.equal(spy.calls.length, 1);
+      assert.deepEqual(spy.calls[0], [9_999_998, 0]);
+    } finally {
+      spy.restore();
+    }
+  });
+
+  test("InMemoryFs round-trips writeFile + readFile", async () => {
+    const vfs = new InMemoryFs();
+    await vfs.writeFile("/tmp/test.txt", "hello", "utf-8");
+    const content = await vfs.readFile("/tmp/test.txt", "utf-8");
+    assert.equal(content, "hello");
+  });
+
+  test("InMemoryFs rename moves the file", async () => {
+    const vfs = new InMemoryFs();
+    await vfs.writeFile("/tmp/a.tmp", "data", "utf-8");
+    await vfs.rename("/tmp/a.tmp", "/tmp/a.json");
+    assert.equal(vfs.has("/tmp/a.tmp"), false);
+    assert.equal(await vfs.readFile("/tmp/a.json", "utf-8"), "data");
+  });
+
+  test("InMemoryFs unlink removes the file and throws ENOENT on re-read", async () => {
+    const vfs = new InMemoryFs();
+    await vfs.writeFile("/tmp/pid", "x", "utf-8");
+    await vfs.unlink("/tmp/pid");
+    await assert.rejects(
+      () => vfs.readFile("/tmp/pid", "utf-8"),
+      (err: NodeJS.ErrnoException) => err.code === "ENOENT",
+    );
+  });
+
+  test("buildMockSidecarChild emits exit event on simulateChildExit", () => {
+    const child = buildMockSidecarChild(42);
+    let fired = false;
+    child.on("exit", (code: number | null) => {
+      fired = true;
+      assert.equal(code, 1);
+    });
+    simulateChildExit(child, 1);
+    assert.equal(fired, true);
+    assert.equal(child.exitCode, 1);
+  });
+
+  test("emitEaddrinuse triggers data event on child.stderr with EADDRINUSE text", () => {
+    const child = buildMockSidecarChild(99);
+    let captured = "";
+    child.stderr.on("data", (line: string) => {
+      captured = line;
+    });
+    emitEaddrinuse(child);
+    assert.match(captured, /EADDRINUSE/);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// T-3.2: Foreign-process guard scenario (AC-011) — source-level invariants
+//
+// These tests verify the behavioral invariants that make the foreign-process
+// scenario correct at runtime by reading the source as text and asserting
+// the presence and ordering of critical logic patterns.
+//
+// AC-011: foreign process holds port 4820 — counter advances 1-5, no PID
+//         killed, no false-positive ready log, terminal "giving up" log fires.
+// ---------------------------------------------------------------------------
+
+describe("T-3.2: foreign-process guard scenario source-level invariants (AC-011)", () => {
+  // -------------------------------------------------------------------------
+  // Invariant 1: restartAttempts increments up to MAX_RESTART_ATTEMPTS
+  // -------------------------------------------------------------------------
+
+  test("restart counter increments on each exit before reaching the cap", () => {
+    // handleExit() must increment restartAttempts (++this.restartAttempts) when
+    // the attempt count is below the cap.
+    assert.match(
+      sidecarSource,
+      /const attempt = \+\+this\.restartAttempts/,
+    );
+  });
+
+  test("restart counter is bounded by MAX_RESTART_ATTEMPTS check before increment", () => {
+    // The guard `this.restartAttempts >= MAX_RESTART_ATTEMPTS` must appear in
+    // handleExit() before the increment, so the cap is enforced correctly.
+    assert.match(handleExitBody, /this\.restartAttempts >= MAX_RESTART_ATTEMPTS/);
+    const capCheckPos = handleExitBody.indexOf("this.restartAttempts >= MAX_RESTART_ATTEMPTS");
+    const incrementPos = handleExitBody.indexOf("const attempt = ++this.restartAttempts");
+    assert.ok(capCheckPos >= 0, "cap check not found in handleExit");
+    assert.ok(incrementPos >= 0, "restart counter increment (const attempt = ++this.restartAttempts) not found in handleExit");
+    assert.ok(
+      capCheckPos < incrementPos,
+      `cap check (pos ${capCheckPos}) must precede increment (pos ${incrementPos})`,
+    );
+  });
+
+  test("restart attempt number is logged with MAX_RESTART_ATTEMPTS denominator", () => {
+    // The log line `attempt N/MAX_RESTART_ATTEMPTS` must appear so the user can
+    // see progress toward the cap (attempt 1/5 through 5/5).
+    assert.match(
+      sidecarSource,
+      /attempt \$\{attempt\}\/\$\{MAX_RESTART_ATTEMPTS\}/,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 2: "giving up" log fires when restartAttempts >= MAX_RESTART_ATTEMPTS
+  // -------------------------------------------------------------------------
+
+  test('"giving up" log message fires inside the MAX_RESTART_ATTEMPTS guard', () => {
+    // The "giving up" error log must be inside the restartAttempts >= cap guard
+    // so it fires exactly when the supervisor exhausts all attempts.
+    assert.match(
+      sidecarSource,
+      /this\.restartAttempts >= MAX_RESTART_ATTEMPTS[\s\S]{0,300}giving up after \$\{this\.restartAttempts\} restart attempts/,
+    );
+  });
+
+  test('"giving up" log uses gatewayLog.error (not warn or info)', () => {
+    // Giving up is a fatal event — it must be logged at error level.
+    const giveUpIdx = sidecarSource.indexOf("giving up after");
+    assert.ok(giveUpIdx >= 0, '"giving up after" string not found in source');
+    // Look back up to 50 chars for the log method name.
+    const context = sidecarSource.slice(Math.max(0, giveUpIdx - 50), giveUpIdx);
+    assert.match(context, /gatewayLog\.error/);
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 3: No process.kill/killGroup when sessionToken is missing from PID file
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan returns without calling killGroup when sessionToken is missing", () => {
+    // When the PID file exists but has no sessionToken, the code must log a
+    // warning and return early (via deletePidFile then return) WITHOUT calling
+    // killGroup.  This is the foreign-process safety guard.
+    assert.match(reclaimOrphanBody, /!sessionToken/);
+
+    // After the !sessionToken check there must be a return; before any killGroup.
+    const noTokenIdx = reclaimOrphanBody.indexOf("!sessionToken");
+    const returnAfterNoToken = reclaimOrphanBody.indexOf("return;", noTokenIdx);
+    const killGroupIdx = reclaimOrphanBody.indexOf("killGroup(");
+    assert.ok(noTokenIdx >= 0, "!sessionToken guard not found");
+    assert.ok(returnAfterNoToken >= 0, "return after !sessionToken not found");
+    assert.ok(killGroupIdx >= 0, "killGroup call not found in reclaimOrphan");
+    assert.ok(
+      returnAfterNoToken < killGroupIdx,
+      `return; after !sessionToken (pos ${returnAfterNoToken}) must precede killGroup (pos ${killGroupIdx}) so missing sessionToken exits before kill`,
+    );
+  });
+
+  test("reclaimOrphan logs a warning when sessionToken is missing (not silently skipped)", () => {
+    // The foreign-process safety warning must be explicit so operators can
+    // diagnose why a port-holding process was not reclaimed.
+    assert.match(
+      sidecarSource,
+      /PID file missing sessionToken[\s\S]{0,100}skipping kill/,
+    );
+  });
+
+  test("killGroup is only called inside the isRunning(pid) guard in reclaimOrphan", () => {
+    // SIGKILL must only be sent if the recorded PID is alive.  This prevents
+    // killing a reused PID that belongs to a different process.
+    const isRunningPos = reclaimOrphanBody.indexOf("isRunning(pid)");
+    const killGroupPos = reclaimOrphanBody.indexOf("killGroup(");
+    assert.ok(isRunningPos >= 0, "isRunning(pid) guard not found in reclaimOrphan");
+    assert.ok(killGroupPos >= 0, "killGroup call not found in reclaimOrphan");
+    assert.ok(
+      isRunningPos < killGroupPos,
+      `isRunning guard (pos ${isRunningPos}) must precede killGroup call (pos ${killGroupPos})`,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 4: onTerminalFailure callback is invoked when giving up
+  // -------------------------------------------------------------------------
+
+  test("onTerminalFailure callback is invoked inside the giving-up branch", () => {
+    // The callback must be called with an actionable reason string when the
+    // supervisor exhausts all restart attempts.
+    assert.match(
+      sidecarSource,
+      /this\.restartAttempts >= MAX_RESTART_ATTEMPTS[\s\S]{0,500}this\.onTerminalFailure\?\.\(reason\)/,
+    );
+  });
+
+  test("onTerminalFailure receives reason string built from lastExitWasPortConflict", () => {
+    // The reason passed to the callback must differ based on whether the exit
+    // was caused by EADDRINUSE, providing an actionable message in both cases.
+    assert.match(
+      sidecarSource,
+      /lastExitWasPortConflict[\s\S]{0,100}port.*is in use by another process/,
+    );
+    // Fallback reason for non-port-conflict terminal failures.
+    assert.match(
+      sidecarSource,
+      /Agent monitor failed after \$\{this\.restartAttempts\} restart attempts/,
+    );
+  });
+
+  test("onTerminalFailure is called with the built reason, not a hardcoded string", () => {
+    // The `reason` variable must be constructed and then passed directly to
+    // the callback — not an inline string literal.
+    assert.match(
+      sidecarSource,
+      /const reason = [\s\S]{0,300}this\.onTerminalFailure\?\.\(reason\)/,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 5: "did not become healthy" log is present with the port number
+  // -------------------------------------------------------------------------
+
+  test('"did not become healthy" log includes the port number', () => {
+    // The warn log must include `this.port` so the operator knows which port
+    // failed, especially when running non-default configurations.
+    assert.match(
+      sidecarSource,
+      /agent monitor did not become healthy on port \$\{this\.port\}/,
+    );
+  });
+
+  test('"did not become healthy" log uses gatewayLog.warn', () => {
+    // This is a recoverable failure (supervisor will retry), so warn is correct.
+    const didNotIdx = sidecarSource.indexOf("agent monitor did not become healthy on port");
+    assert.ok(didNotIdx >= 0, '"did not become healthy" log not found');
+    const context = sidecarSource.slice(Math.max(0, didNotIdx - 60), didNotIdx);
+    assert.match(context, /gatewayLog\.warn/);
+  });
+
+  test('"did not become healthy" log is only reached when this.child === child (stale-guard)', () => {
+    // The stale-guard check `this.child !== child` with an early return must
+    // precede the warn log so a superseded launch cannot emit this message.
+    // (Shared with AC-010 but validated here as part of the foreign-process
+    // behavioral invariant set.)
+    assert.match(
+      sidecarSource,
+      /this\.child !== child[\s\S]{0,200}return;[\s\S]{0,400}agent monitor did not become healthy/,
+    );
+  });
+});
+
+// ---------------------------------------------------------------------------
+// T-3.3: Orphan recovery scenario (AC-012) — source-level invariants
+//
+// These tests verify the behavioral invariants that make the orphan recovery
+// scenario correct at runtime.  They assert the presence and ordering of
+// critical logic patterns in the sidecar source text.
+//
+// AC-012: orphan recovery — spawn, persist PID, force-kill, restart, orphan
+//         SIGKILLed, new spawn binds port 4820 successfully and reaches ready.
+// ---------------------------------------------------------------------------
+
+describe("T-3.3: orphan recovery scenario source-level invariants (AC-012)", () => {
+  // -------------------------------------------------------------------------
+  // Invariant 1: reclaimOrphan reads the PID file with fs.readFile
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan reads sidecar.pid using fs.readFile", () => {
+    // The reclaim path must use fs.readFile to retrieve the persisted PID record
+    // so it can detect a running orphan from a prior session.
+    assert.match(
+      reclaimOrphanBody,
+      /await fs\.readFile\(pidFile, "utf-8"\)/,
+      "reclaimOrphan must call fs.readFile(pidFile, 'utf-8') to read the PID file",
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 2: reclaimOrphan calls isRunning() with the parsed pid
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan calls isRunning(pid) with the parsed pid before deciding to kill", () => {
+    // isRunning() is the alive-check that gates the SIGKILL.  Without this check
+    // the reclaim path could kill a reused PID that belongs to a different process.
+    assert.match(
+      reclaimOrphanBody,
+      /isRunning\(pid\)/,
+      "reclaimOrphan must call isRunning(pid) to check whether the orphan is alive",
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 3: reclaimOrphan calls killGroup(pid, "SIGKILL") when isRunning is true
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan calls killGroup(pid, 'SIGKILL') inside the isRunning guard", () => {
+    // SIGKILL must only be sent when the recorded PID is confirmed alive via
+    // isRunning(pid).  The kill and the guard must appear in the right order.
+    const isRunningPos = reclaimOrphanBody.indexOf("isRunning(pid)");
+    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
+    assert.ok(isRunningPos >= 0, "isRunning(pid) not found in reclaimOrphan body");
+    assert.ok(killGroupPos >= 0, 'killGroup(pid, "SIGKILL") not found in reclaimOrphan body');
+
+    // The alive check must precede the kill so SIGKILL is conditional on liveness.
+    assert.ok(
+      isRunningPos < killGroupPos,
+      `isRunning(pid) (pos ${isRunningPos}) must precede killGroup SIGKILL (pos ${killGroupPos})`,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 4: reclaimOrphan calls deletePidFile() to clean up after killing
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan calls deletePidFile() unconditionally after the kill path", () => {
+    // The PID file must always be cleaned up at the end of reclaimOrphan —
+    // regardless of whether the orphan was alive — so a stale file cannot
+    // trigger a spurious kill on the next launch.
+    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
+    const lastDeletePos = reclaimOrphanBody.lastIndexOf("await this.deletePidFile()");
+    assert.ok(killGroupPos >= 0, 'killGroup(pid, "SIGKILL") not found in reclaimOrphan body');
+    assert.ok(lastDeletePos >= 0, "await this.deletePidFile() not found in reclaimOrphan body");
+    assert.ok(
+      killGroupPos < lastDeletePos,
+      `killGroup SIGKILL (pos ${killGroupPos}) must precede the final deletePidFile (pos ${lastDeletePos})`,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 5: reclaimOrphan is called in launch() before spawn()
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan is called in launch() before the spawn() call", () => {
+    // Pre-bind orphan reclamation must happen before we attempt to spawn a new
+    // child so the port is free when the new process tries to bind.
+    const reclaimCallPos = launchBody.indexOf("await this.reclaimOrphan()");
+    const spawnCallPos = launchBody.indexOf("const child = spawn(");
+    assert.ok(reclaimCallPos >= 0, "await this.reclaimOrphan() call not found in launch()");
+    assert.ok(spawnCallPos >= 0, "const child = spawn() call not found in launch()");
+    assert.ok(
+      reclaimCallPos < spawnCallPos,
+      `reclaimOrphan() call (pos ${reclaimCallPos}) must precede spawn() call (pos ${spawnCallPos})`,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 6: ENOENT errors are handled gracefully (just returns)
+  // -------------------------------------------------------------------------
+
+  test("reclaimOrphan handles ENOENT from fs.readFile by returning early without killing", () => {
+    // If no sidecar.pid file exists (first run or prior clean shutdown), the
+    // reclaim path must silently return without attempting any kill.
+    assert.match(
+      reclaimOrphanBody,
+      /code === "ENOENT"[\s\S]{0,60}return;/,
+      'reclaimOrphan must check error.code === "ENOENT" and return early',
+    );
+
+    // The ENOENT early return must precede the killGroup call so that a missing
+    // PID file never results in a kill.
+    const enoentPos = reclaimOrphanBody.indexOf('code === "ENOENT"');
+    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
+    assert.ok(enoentPos >= 0, 'ENOENT guard not found in reclaimOrphan body');
+    assert.ok(killGroupPos >= 0, 'killGroup call not found in reclaimOrphan body');
+    assert.ok(
+      enoentPos < killGroupPos,
+      `ENOENT guard (pos ${enoentPos}) must precede killGroup (pos ${killGroupPos}) so missing file exits before kill`,
+    );
+  });
+});
+
+// ---------------------------------------------------------------------------
+// T-3.4: Stale log suppression scenario (AC-013) — source-level invariants
+//
+// These tests verify the behavioral invariants that prevent a previous launch's
+// stale waitForHealth resolution from emitting misleading "did not become
+// healthy" logs after a new launch has already started.
+//
+// AC-013: stale log suppression — prev-launch resolves after new-launch race;
+//         misleading "did not become healthy" log does not fire for the stale
+//         context.
+//
+// The race condition: when launch() is called twice in rapid succession (e.g.
+// because handleExit fires a restart while a prior waitForHealth is still
+// polling), the first launch's waitForHealth eventually resolves false after
+// the new child has already been set on this.child. Without the stale guard,
+// the first launch would emit a misleading warn log and call flushReady(false),
+// potentially overwriting the second launch's ready state.
+// ---------------------------------------------------------------------------
+
+describe("T-3.4: stale log suppression scenario source-level invariants (AC-013)", () => {
+  // -------------------------------------------------------------------------
+  // Invariant 1: this.child !== child early-return guard is present in launch()
+  // -------------------------------------------------------------------------
+
+  test("launch() contains the this.child !== child stale guard before the warn log", () => {
+    // The stale guard must be present so that when a second launch() has already
+    // replaced this.child, the first launch's continuation returns immediately
+    // without logging the misleading "did not become healthy" message.
+    assert.match(
+      launchBody,
+      /this\.child !== child/,
+      "launch() must contain the this.child !== child stale guard",
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 2: the stale guard must result in an early return
+  // -------------------------------------------------------------------------
+
+  test("the this.child !== child guard has an early return that precedes the warn log", () => {
+    // The return statement must immediately follow the stale guard check so
+    // the warn log and flushReady(false) are completely skipped for stale launches.
+    assert.match(
+      sidecarSource,
+      /this\.child !== child[\s\S]{0,50}return;/,
+      "this.child !== child guard must be followed by a return statement",
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 3: the stale guard early return precedes the warn log in source
+  // -------------------------------------------------------------------------
+
+  test("the stale guard early return appears before the warn log in launch() body", () => {
+    // Position-based assertion: the early return in the stale guard must come
+    // before the warn log so the warn is unreachable for superseded launches.
+    const staleGuardPos = launchBody.indexOf("this.child !== child");
+    const warnLogPos = launchBody.indexOf(
+      "agent monitor did not become healthy on port",
+    );
+    assert.ok(staleGuardPos >= 0, "this.child !== child not found in launch()");
+    assert.ok(
+      warnLogPos >= 0,
+      "\"agent monitor did not become healthy\" log not found in launch()",
+    );
+    assert.ok(
+      staleGuardPos < warnLogPos,
+      `stale guard (pos ${staleGuardPos}) must precede the warn log (pos ${warnLogPos})`,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 4: flushReady(false) is skipped when the launch is stale
+  // -------------------------------------------------------------------------
+
+  test("flushReady(false) is only reachable after the stale guard in launch()", () => {
+    // When this.child !== child, the code returns before the flushReady(false)
+    // call, ensuring the newer launch's ready state is not overwritten.
+    // We verify this by asserting the stale guard return precedes flushReady(false).
+    const staleGuardPos = launchBody.indexOf("this.child !== child");
+    // Find the flushReady(false) call that follows the warn log (there may be
+    // earlier flushReady(false) calls in the early-exit paths at the top of launch()).
+    const warnLogPos = launchBody.indexOf("agent monitor did not become healthy");
+    const flushReadyAfterWarn = launchBody.indexOf("this.flushReady(false)", warnLogPos);
+    assert.ok(staleGuardPos >= 0, "stale guard not found in launch() body");
+    assert.ok(warnLogPos >= 0, "warn log not found in launch() body");
+    assert.ok(
+      flushReadyAfterWarn >= 0,
+      "flushReady(false) after warn log not found in launch() body",
+    );
+    // The stale guard must come before flushReady(false), confirming that when the
+    // guard fires and returns early, flushReady(false) is bypassed.
+    assert.ok(
+      staleGuardPos < flushReadyAfterWarn,
+      `stale guard (pos ${staleGuardPos}) must precede flushReady(false) (pos ${flushReadyAfterWarn}) so the call is skipped for stale launches`,
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 5: the guard compares local child against this.child (not this.child against this.child)
+  // -------------------------------------------------------------------------
+
+  test("the stale guard compares the local child variable against this.child", () => {
+    // The guard must reference the closure-captured local `child` variable from
+    // the spawn call — not a stale snapshot of `this.child`. This ensures that
+    // the comparison correctly detects when a newer launch has replaced this.child
+    // after the current launch captured its local reference.
+
+    // The guard must be expressed as `this.child !== child` (this.child on the
+    // left, local child on the right) — not `child !== child` or any other form.
+    assert.match(
+      launchBody,
+      /if \(this\.child !== child\)/,
+      "stale guard must use the exact form `if (this.child !== child)`",
+    );
+
+    // The local `child` variable must be defined in launch() via the spawn() call.
+    assert.match(
+      launchBody,
+      /const child = spawn\(/,
+      "local `child` must be set via spawn() in launch()",
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Invariant 6: the stale guard comment explains the race condition
+  // -------------------------------------------------------------------------
+
+  test("the stale guard has an explanatory comment about the superseded launch", () => {
+    // A comment documenting the race condition makes the invariant auditable
+    // and prevents future maintainers from inadvertently removing the guard.
+    // The comment must appear near the stale guard.
+    assert.match(
+      launchBody,
+      /superseded[\s\S]{0,200}this\.child !== child/,
+      "a comment mentioning \"superseded\" must appear before the stale guard in launch()",
+    );
+  });
+});
diff --git a/apps/desktop/test/agent-monitor-wiring-static.test.ts b/apps/desktop/test/agent-monitor-wiring-static.test.ts
index e00a5b3b..f6c3c2a6 100644
--- a/apps/desktop/test/agent-monitor-wiring-static.test.ts
+++ b/apps/desktop/test/agent-monitor-wiring-static.test.ts
@@ -321,7 +321,7 @@ test("agent monitor defaults on; plan extraction is feature-gated and defaults o
 });
 
 test("sidecar is feature-gated and, when enabled, starts before the gateway", () => {
-  assert.match(appSource, /this\.agentMonitor = new AgentMonitorSidecar\(\)/);
+  assert.match(appSource, /this\.agentMonitor = new AgentMonitorSidecar\([\s\S]*?\)/);
   assert.match(
     appSource,
     /if \(this\.settingsStore\.getAgentMonitorEnabled\(\)\) \{[\s\S]*void this\.agentMonitor\.start\(\);[\s\S]*syncAgentMonitorHooksOnBoot\(\);[\s\S]*this\.agentSessionSync\.start\(\);/,

From dbdc9a6155ac088c424dd9941cc041fa2f0cb92b Mon Sep 17 00:00:00 2001
From: Alexander Ponamarev <alex.ponamaryov@gmail.com>
Date: Thu, 28 May 2026 13:40:23 -0500
Subject: [PATCH 2/6] PLN-745: Reduce sidecar test bloat

- Remove unused future sidecar test harness helpers

- Drop duplicate orphan-recovery invariant checks

Testing: Full desktop test suite, typecheck, and lint passed

Risks: Low; cleanup only removes unused and redundant test code
---
 .../test/agent-monitor-sidecar.test.ts        | 678 +-----------------
 1 file changed, 1 insertion(+), 677 deletions(-)

diff --git a/apps/desktop/test/agent-monitor-sidecar.test.ts b/apps/desktop/test/agent-monitor-sidecar.test.ts
index bd21f597..c5b59799 100644
--- a/apps/desktop/test/agent-monitor-sidecar.test.ts
+++ b/apps/desktop/test/agent-monitor-sidecar.test.ts
@@ -15,20 +15,11 @@
  * agent-monitor-wiring-static.test.ts: they read the source as text and assert
  * the implementation invariants that make each AC hold at runtime.
  *
- * Shared helpers exported from this file are designed to be re-used by
- * integration-level tests once a test-harness shim for the sidecar is
- * available.
  */
 
 import assert from "node:assert/strict";
-import http from "node:http";
-import { EventEmitter } from "node:events";
-import fs from "node:fs/promises";
-import net from "node:net";
-import os from "node:os";
-import path from "node:path";
 import { readFileSync } from "node:fs";
-import { afterEach, describe, mock, test } from "node:test";
+import { describe, test } from "node:test";
 
 // ---------------------------------------------------------------------------
 // Source text fixture (read once at module evaluation time)
@@ -58,312 +49,6 @@ const reclaimOrphanBody = methodBody("private async reclaimOrphan()", 1600);
 const handleExitBody = methodBody("private handleExit(", 1200);
 const launchBody = methodBody("private async launch()", 4000);
 
-// ---------------------------------------------------------------------------
-// Shared helpers: PID file state
-// ---------------------------------------------------------------------------
-
-export interface PidFileRecord {
-  pid: number;
-  sessionToken: string;
-  recordedAt: string;
-}
-
-/**
- * Write a sidecar.pid file to the given userData directory, matching the
- * atomic-write pattern used by writePidFile() in the sidecar source:
- * write a .tmp file then rename it into place.
- */
-export async function writePidFile(
-  userDataDir: string,
-  record: PidFileRecord,
-): Promise<string> {
-  const dir = path.join(userDataDir, "agent-monitor");
-  const pidFile = path.join(dir, "sidecar.pid");
-  const tmpFile = `${pidFile}.tmp`;
-  await fs.mkdir(dir, { recursive: true });
-  await fs.writeFile(tmpFile, JSON.stringify(record), "utf-8");
-  await fs.rename(tmpFile, pidFile);
-  return pidFile;
-}
-
-/**
- * Read and parse a sidecar.pid file from the given userData directory.
- * Returns null if the file does not exist (ENOENT).
- */
-export async function readPidFile(
-  userDataDir: string,
-): Promise<PidFileRecord | null> {
-  const pidFile = path.join(userDataDir, "agent-monitor", "sidecar.pid");
-  try {
-    const raw = await fs.readFile(pidFile, "utf-8");
-    return JSON.parse(raw) as PidFileRecord;
-  } catch (error) {
-    if ((error as NodeJS.ErrnoException).code === "ENOENT") {
-      return null;
-    }
-    throw error;
-  }
-}
-
-/**
- * Assert that no sidecar.pid file exists in the given userData directory.
- */
-export async function assertNoPidFile(userDataDir: string): Promise<void> {
-  const record = await readPidFile(userDataDir);
-  assert.equal(
-    record,
-    null,
-    `expected sidecar.pid to be absent in ${userDataDir}/agent-monitor/`,
-  );
-}
-
-/**
- * Assert that a sidecar.pid file exists and matches the expected fields.
- */
-export async function assertPidFile(
-  userDataDir: string,
-  expected: Partial<PidFileRecord>,
-): Promise<PidFileRecord> {
-  const record = await readPidFile(userDataDir);
-  assert.ok(
-    record !== null,
-    `expected sidecar.pid to exist in ${userDataDir}/agent-monitor/`,
-  );
-  if (expected.pid !== undefined) {
-    assert.equal(record.pid, expected.pid, "PID mismatch in sidecar.pid");
-  }
-  if (expected.sessionToken !== undefined) {
-    assert.equal(
-      record.sessionToken,
-      expected.sessionToken,
-      "sessionToken mismatch in sidecar.pid",
-    );
-  }
-  return record;
-}
-
-// ---------------------------------------------------------------------------
-// Shared helpers: mock HTTP health server
-// ---------------------------------------------------------------------------
-
-export interface HealthServer {
-  /** Port the server is listening on. */
-  port: number;
-  /** Shut down the server and release the port. */
-  close(): Promise<void>;
-}
-
-/**
- * Start a minimal HTTP server that responds 200 OK to GET /api/health.
- * Binds to 127.0.0.1 on the given port (or a random port if 0).
- *
- * Used in tests to simulate a healthy sidecar already on port 4820
- * (foreign process scenario) or to act as the real sidecar in integration
- * tests.
- */
-export function startHealthServer(
-  port: number,
-  opts?: { statusCode?: number; body?: string },
-): Promise<HealthServer> {
-  return new Promise((resolve, reject) => {
-    const statusCode = opts?.statusCode ?? 200;
-    const body = opts?.body ?? "ok";
-    const server = http.createServer((req, res) => {
-      if (req.url === "/api/health") {
-        res.writeHead(statusCode, { "Content-Type": "text/plain" });
-        res.end(body);
-      } else {
-        res.writeHead(404);
-        res.end();
-      }
-    });
-    server.listen(port, "127.0.0.1", () => {
-      const addr = server.address() as net.AddressInfo;
-      resolve({
-        port: addr.port,
-        close: () =>
-          new Promise<void>((res, rej) =>
-            server.close((err) => (err ? rej(err) : res())),
-          ),
-      });
-    });
-    server.once("error", reject);
-  });
-}
-
-/**
- * Check whether a TCP port on 127.0.0.1 is currently accepting connections.
- * Returns true if a connection succeeds within 200ms, false otherwise.
- */
-export function isPortListening(port: number): Promise<boolean> {
-  return new Promise((resolve) => {
-    const socket = new net.Socket();
-    const onError = () => {
-      socket.destroy();
-      resolve(false);
-    };
-    socket.setTimeout(200);
-    socket.once("error", onError);
-    socket.once("timeout", onError);
-    socket.connect(port, "127.0.0.1", () => {
-      socket.destroy();
-      resolve(true);
-    });
-  });
-}
-
-// ---------------------------------------------------------------------------
-// Shared helpers: process.kill spy
-// ---------------------------------------------------------------------------
-
-export interface ProcessKillSpy {
-  /** All recorded kill() calls: [pid, signal] tuples. */
-  calls: Array<[number, string | number]>;
-  /** Restore the original process.kill. */
-  restore(): void;
-}
-
-/**
- * Replace process.kill with a spy that records calls without forwarding
- * to the OS.  The spy does NOT send signals to real processes; use only
- * in unit tests that need to verify that kill() is (or is not) called with
- * specific arguments.
- *
- * Returns a handle with the recorded calls and a restore() method.
- */
-export function spyProcessKill(): ProcessKillSpy {
-  const calls: Array<[number, string | number]> = [];
-  const originalKill = process.kill.bind(process);
-
-  // process.kill signature: (pid: number, signal?: string | number) => true
-  // Cast required because mock.method expects method semantics but process.kill
-  // is a standalone function on the process object.
-  const spy = mock.method(
-    process,
-    "kill",
-    (pid: number, signal?: string | number) => {
-      calls.push([pid, signal ?? 0]);
-      // Return true to match the process.kill return type.
-      return true;
-    },
-  );
-
-  return {
-    calls,
-    restore() {
-      spy.mock.restore();
-      void originalKill; // reference to avoid unused-variable lint
-    },
-  };
-}
-
-// ---------------------------------------------------------------------------
-// Shared helpers: temporary userData directory
-// ---------------------------------------------------------------------------
-
-const tempDirsToClean: string[] = [];
-
-afterEach(async () => {
-  for (const dir of tempDirsToClean.splice(0)) {
-    await fs.rm(dir, { recursive: true, force: true });
-  }
-});
-
-/**
- * Create a temporary directory that acts as `app.getPath("userData")` for a
- * test.  The directory is automatically removed in the afterEach hook.
- */
-export async function makeTempUserDataDir(): Promise<string> {
-  const dir = await fs.mkdtemp(
-    path.join(os.tmpdir(), "sidecar-test-userdata-"),
-  );
-  tempDirsToClean.push(dir);
-  return dir;
-}
-
-// ---------------------------------------------------------------------------
-// Shared helpers: mock gatewayLog
-// ---------------------------------------------------------------------------
-
-export interface GatewayLogSpy {
-  entries: Array<{ level: string; tag: string; message: string }>;
-  reset(): void;
-}
-
-/**
- * Create a lightweight spy that captures log calls.  It does NOT mock the
- * actual gatewayLog singleton (which is an ESM live binding and cannot be
- * intercepted via mock.method in Node's test runner without module loader
- * hooks).  Instead, tests that need to verify log output should pass this
- * spy into the helper functions below that accept a log callback, or assert
- * on log-driven side effects.
- */
-export function createGatewayLogSpy(): GatewayLogSpy {
-  const entries: Array<{ level: string; tag: string; message: string }> = [];
-  return {
-    entries,
-    reset() {
-      entries.length = 0;
-    },
-  };
-}
-
-// ---------------------------------------------------------------------------
-// Shared helpers: mock ChildProcess
-// ---------------------------------------------------------------------------
-
-/**
- * Minimal mock ChildProcess with typed event emitters for stdout, stderr.
- * Matches the interface consumed by pipeLines() and the exit/error handlers
- * in agent-monitor-sidecar.ts.
- */
-export type MockSidecarChild = EventEmitter & {
-  pid: number | undefined;
-  exitCode: number | null;
-  stdout: EventEmitter & { setEncoding(enc: string): void };
-  stderr: EventEmitter & { setEncoding(enc: string): void };
-};
-
-export function buildMockSidecarChild(pid?: number): MockSidecarChild {
-  const child = new EventEmitter() as MockSidecarChild;
-  child.pid = pid;
-  child.exitCode = null;
-
-  const stdout = new EventEmitter() as MockSidecarChild["stdout"];
-  stdout.setEncoding = () => {};
-  child.stdout = stdout;
-
-  const stderr = new EventEmitter() as MockSidecarChild["stderr"];
-  stderr.setEncoding = () => {};
-  child.stderr = stderr;
-
-  return child;
-}
-
-/**
- * Simulate the child exiting by setting exitCode and emitting the "exit"
- * event, matching the Node.js ChildProcess contract.
- */
-export function simulateChildExit(
-  child: MockSidecarChild,
-  code: number | null,
-  signal: NodeJS.Signals | null = null,
-): void {
-  child.exitCode = code ?? 1;
-  child.emit("exit", code, signal);
-}
-
-/**
- * Emit an EADDRINUSE line on the child's stderr stream so the sidecar's
- * stderr handler sets lastExitWasPortConflict = true.
- */
-export function emitEaddrinuse(child: MockSidecarChild): void {
-  child.stderr.emit(
-    "data",
-    "Error: listen EADDRINUSE: address already in use :::4820\n",
-  );
-}
-
 // ---------------------------------------------------------------------------
 // Static verification tests (AC-006 through AC-010 source-level invariants)
 // ---------------------------------------------------------------------------
@@ -519,247 +204,6 @@ describe("agent-monitor-sidecar.ts source-level invariants", () => {
   });
 });
 
-// ---------------------------------------------------------------------------
-// Helpers: fs/promises mock support
-// ---------------------------------------------------------------------------
-
-/**
- * In-memory filesystem store keyed by absolute path.
- * Used by tests that need to intercept fs.readFile / fs.writeFile / fs.rename
- * / fs.unlink / fs.mkdir without touching real disk.
- *
- * This is a lightweight substitute for a full fs mock; tests that need
- * fine-grained filesystem control can use makeTempUserDataDir() instead.
- */
-export class InMemoryFs {
-  private readonly files = new Map<string, string>();
-
-  async readFile(filePath: string, _encoding: "utf-8"): Promise<string> {
-    const content = this.files.get(filePath);
-    if (content === undefined) {
-      const err = Object.assign(
-        new Error(`ENOENT: no such file or directory, open '${filePath}'`),
-        { code: "ENOENT" },
-      );
-      throw err;
-    }
-    return content;
-  }
-
-  async writeFile(
-    filePath: string,
-    data: string,
-    _encoding: "utf-8",
-  ): Promise<void> {
-    this.files.set(filePath, data);
-  }
-
-  async rename(oldPath: string, newPath: string): Promise<void> {
-    const content = this.files.get(oldPath);
-    if (content === undefined) {
-      const err = Object.assign(
-        new Error(`ENOENT: no such file or directory, rename '${oldPath}'`),
-        { code: "ENOENT" },
-      );
-      throw err;
-    }
-    this.files.delete(oldPath);
-    this.files.set(newPath, content);
-  }
-
-  async unlink(filePath: string): Promise<void> {
-    if (!this.files.has(filePath)) {
-      const err = Object.assign(
-        new Error(
-          `ENOENT: no such file or directory, unlink '${filePath}'`,
-        ),
-        { code: "ENOENT" },
-      );
-      throw err;
-    }
-    this.files.delete(filePath);
-  }
-
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  async mkdir(_dirPath: string, _opts?: { recursive?: boolean }): Promise<void> {
-    // no-op: directories are implicit in this in-memory store
-  }
-
-  has(filePath: string): boolean {
-    return this.files.has(filePath);
-  }
-
-  get(filePath: string): string | undefined {
-    return this.files.get(filePath);
-  }
-
-  set(filePath: string, content: string): void {
-    this.files.set(filePath, content);
-  }
-}
-
-// ---------------------------------------------------------------------------
-// Helpers: integration test setup
-// ---------------------------------------------------------------------------
-
-/**
- * Build a PID file record with a known session token and a realistic
- * recordedAt timestamp.  Used by T-3.3 (orphan recovery scenario).
- */
-export function makePidRecord(
-  pid: number,
-  sessionToken = "test-session-token",
-): PidFileRecord {
-  return {
-    pid,
-    sessionToken,
-    recordedAt: new Date().toISOString(),
-  };
-}
-
-/**
- * Poll until a predicate returns true, or throw after the given timeout.
- */
-export async function waitForCondition(
-  predicate: () => boolean | Promise<boolean>,
-  timeoutMs = 5_000,
-  pollMs = 50,
-): Promise<void> {
-  const deadline = Date.now() + timeoutMs;
-  while (!(await predicate())) {
-    if (Date.now() > deadline) {
-      throw new Error(
-        `waitForCondition timed out after ${timeoutMs}ms`,
-      );
-    }
-    await new Promise<void>((resolve) => setTimeout(resolve, pollMs));
-  }
-}
-
-// ---------------------------------------------------------------------------
-// Smoke tests for the shared helpers themselves
-// ---------------------------------------------------------------------------
-
-describe("shared test-harness helpers", () => {
-  test("writePidFile + readPidFile round-trips a record", async () => {
-    const dir = await makeTempUserDataDir();
-    const record = makePidRecord(12345, "tok-abc");
-    await writePidFile(dir, record);
-    const got = await readPidFile(dir);
-    assert.deepEqual(got, record);
-  });
-
-  test("readPidFile returns null when file is absent", async () => {
-    const dir = await makeTempUserDataDir();
-    assert.equal(await readPidFile(dir), null);
-  });
-
-  test("assertNoPidFile does not throw when file is absent", async () => {
-    const dir = await makeTempUserDataDir();
-    await assert.doesNotReject(assertNoPidFile(dir));
-  });
-
-  test("assertNoPidFile throws when file exists", async () => {
-    const dir = await makeTempUserDataDir();
-    await writePidFile(dir, makePidRecord(1));
-    await assert.rejects(assertNoPidFile(dir));
-  });
-
-  test("startHealthServer responds 200 to GET /api/health", async () => {
-    const srv = await startHealthServer(0);
-    try {
-      const res = await fetch(`http://127.0.0.1:${srv.port}/api/health`);
-      assert.equal(res.status, 200);
-    } finally {
-      await srv.close();
-    }
-  });
-
-  test("startHealthServer returns the bound port", async () => {
-    const srv = await startHealthServer(0);
-    try {
-      assert.ok(srv.port > 0 && srv.port < 65536);
-    } finally {
-      await srv.close();
-    }
-  });
-
-  test("isPortListening returns true for a listening server", async () => {
-    const srv = await startHealthServer(0);
-    try {
-      assert.equal(await isPortListening(srv.port), true);
-    } finally {
-      await srv.close();
-    }
-  });
-
-  test("isPortListening returns false for a closed port", async () => {
-    // Use a port that is very unlikely to have a listener.
-    const unusedPort = 39847;
-    assert.equal(await isPortListening(unusedPort), false);
-  });
-
-  test("spyProcessKill records kill calls without forwarding to OS", () => {
-    const spy = spyProcessKill();
-    try {
-      // Call with a synthetic PID that could never be a real process.
-      process.kill(9_999_998 as number, 0 as number);
-      // The spy intercepts the call; the OS never sees it.
-      assert.equal(spy.calls.length, 1);
-      assert.deepEqual(spy.calls[0], [9_999_998, 0]);
-    } finally {
-      spy.restore();
-    }
-  });
-
-  test("InMemoryFs round-trips writeFile + readFile", async () => {
-    const vfs = new InMemoryFs();
-    await vfs.writeFile("/tmp/test.txt", "hello", "utf-8");
-    const content = await vfs.readFile("/tmp/test.txt", "utf-8");
-    assert.equal(content, "hello");
-  });
-
-  test("InMemoryFs rename moves the file", async () => {
-    const vfs = new InMemoryFs();
-    await vfs.writeFile("/tmp/a.tmp", "data", "utf-8");
-    await vfs.rename("/tmp/a.tmp", "/tmp/a.json");
-    assert.equal(vfs.has("/tmp/a.tmp"), false);
-    assert.equal(await vfs.readFile("/tmp/a.json", "utf-8"), "data");
-  });
-
-  test("InMemoryFs unlink removes the file and throws ENOENT on re-read", async () => {
-    const vfs = new InMemoryFs();
-    await vfs.writeFile("/tmp/pid", "x", "utf-8");
-    await vfs.unlink("/tmp/pid");
-    await assert.rejects(
-      () => vfs.readFile("/tmp/pid", "utf-8"),
-      (err: NodeJS.ErrnoException) => err.code === "ENOENT",
-    );
-  });
-
-  test("buildMockSidecarChild emits exit event on simulateChildExit", () => {
-    const child = buildMockSidecarChild(42);
-    let fired = false;
-    child.on("exit", (code: number | null) => {
-      fired = true;
-      assert.equal(code, 1);
-    });
-    simulateChildExit(child, 1);
-    assert.equal(fired, true);
-    assert.equal(child.exitCode, 1);
-  });
-
-  test("emitEaddrinuse triggers data event on child.stderr with EADDRINUSE text", () => {
-    const child = buildMockSidecarChild(99);
-    let captured = "";
-    child.stderr.on("data", (line: string) => {
-      captured = line;
-    });
-    emitEaddrinuse(child);
-    assert.match(captured, /EADDRINUSE/);
-  });
-});
-
 // ---------------------------------------------------------------------------
 // T-3.2: Foreign-process guard scenario (AC-011) — source-level invariants
 //
@@ -944,126 +388,6 @@ describe("T-3.2: foreign-process guard scenario source-level invariants (AC-011)
   });
 });
 
-// ---------------------------------------------------------------------------
-// T-3.3: Orphan recovery scenario (AC-012) — source-level invariants
-//
-// These tests verify the behavioral invariants that make the orphan recovery
-// scenario correct at runtime.  They assert the presence and ordering of
-// critical logic patterns in the sidecar source text.
-//
-// AC-012: orphan recovery — spawn, persist PID, force-kill, restart, orphan
-//         SIGKILLed, new spawn binds port 4820 successfully and reaches ready.
-// ---------------------------------------------------------------------------
-
-describe("T-3.3: orphan recovery scenario source-level invariants (AC-012)", () => {
-  // -------------------------------------------------------------------------
-  // Invariant 1: reclaimOrphan reads the PID file with fs.readFile
-  // -------------------------------------------------------------------------
-
-  test("reclaimOrphan reads sidecar.pid using fs.readFile", () => {
-    // The reclaim path must use fs.readFile to retrieve the persisted PID record
-    // so it can detect a running orphan from a prior session.
-    assert.match(
-      reclaimOrphanBody,
-      /await fs\.readFile\(pidFile, "utf-8"\)/,
-      "reclaimOrphan must call fs.readFile(pidFile, 'utf-8') to read the PID file",
-    );
-  });
-
-  // -------------------------------------------------------------------------
-  // Invariant 2: reclaimOrphan calls isRunning() with the parsed pid
-  // -------------------------------------------------------------------------
-
-  test("reclaimOrphan calls isRunning(pid) with the parsed pid before deciding to kill", () => {
-    // isRunning() is the alive-check that gates the SIGKILL.  Without this check
-    // the reclaim path could kill a reused PID that belongs to a different process.
-    assert.match(
-      reclaimOrphanBody,
-      /isRunning\(pid\)/,
-      "reclaimOrphan must call isRunning(pid) to check whether the orphan is alive",
-    );
-  });
-
-  // -------------------------------------------------------------------------
-  // Invariant 3: reclaimOrphan calls killGroup(pid, "SIGKILL") when isRunning is true
-  // -------------------------------------------------------------------------
-
-  test("reclaimOrphan calls killGroup(pid, 'SIGKILL') inside the isRunning guard", () => {
-    // SIGKILL must only be sent when the recorded PID is confirmed alive via
-    // isRunning(pid).  The kill and the guard must appear in the right order.
-    const isRunningPos = reclaimOrphanBody.indexOf("isRunning(pid)");
-    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
-    assert.ok(isRunningPos >= 0, "isRunning(pid) not found in reclaimOrphan body");
-    assert.ok(killGroupPos >= 0, 'killGroup(pid, "SIGKILL") not found in reclaimOrphan body');
-
-    // The alive check must precede the kill so SIGKILL is conditional on liveness.
-    assert.ok(
-      isRunningPos < killGroupPos,
-      `isRunning(pid) (pos ${isRunningPos}) must precede killGroup SIGKILL (pos ${killGroupPos})`,
-    );
-  });
-
-  // -------------------------------------------------------------------------
-  // Invariant 4: reclaimOrphan calls deletePidFile() to clean up after killing
-  // -------------------------------------------------------------------------
-
-  test("reclaimOrphan calls deletePidFile() unconditionally after the kill path", () => {
-    // The PID file must always be cleaned up at the end of reclaimOrphan —
-    // regardless of whether the orphan was alive — so a stale file cannot
-    // trigger a spurious kill on the next launch.
-    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
-    const lastDeletePos = reclaimOrphanBody.lastIndexOf("await this.deletePidFile()");
-    assert.ok(killGroupPos >= 0, 'killGroup(pid, "SIGKILL") not found in reclaimOrphan body');
-    assert.ok(lastDeletePos >= 0, "await this.deletePidFile() not found in reclaimOrphan body");
-    assert.ok(
-      killGroupPos < lastDeletePos,
-      `killGroup SIGKILL (pos ${killGroupPos}) must precede the final deletePidFile (pos ${lastDeletePos})`,
-    );
-  });
-
-  // -------------------------------------------------------------------------
-  // Invariant 5: reclaimOrphan is called in launch() before spawn()
-  // -------------------------------------------------------------------------
-
-  test("reclaimOrphan is called in launch() before the spawn() call", () => {
-    // Pre-bind orphan reclamation must happen before we attempt to spawn a new
-    // child so the port is free when the new process tries to bind.
-    const reclaimCallPos = launchBody.indexOf("await this.reclaimOrphan()");
-    const spawnCallPos = launchBody.indexOf("const child = spawn(");
-    assert.ok(reclaimCallPos >= 0, "await this.reclaimOrphan() call not found in launch()");
-    assert.ok(spawnCallPos >= 0, "const child = spawn() call not found in launch()");
-    assert.ok(
-      reclaimCallPos < spawnCallPos,
-      `reclaimOrphan() call (pos ${reclaimCallPos}) must precede spawn() call (pos ${spawnCallPos})`,
-    );
-  });
-
-  // -------------------------------------------------------------------------
-  // Invariant 6: ENOENT errors are handled gracefully (just returns)
-  // -------------------------------------------------------------------------
-
-  test("reclaimOrphan handles ENOENT from fs.readFile by returning early without killing", () => {
-    // If no sidecar.pid file exists (first run or prior clean shutdown), the
-    // reclaim path must silently return without attempting any kill.
-    assert.match(
-      reclaimOrphanBody,
-      /code === "ENOENT"[\s\S]{0,60}return;/,
-      'reclaimOrphan must check error.code === "ENOENT" and return early',
-    );
-
-    // The ENOENT early return must precede the killGroup call so that a missing
-    // PID file never results in a kill.
-    const enoentPos = reclaimOrphanBody.indexOf('code === "ENOENT"');
-    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
-    assert.ok(enoentPos >= 0, 'ENOENT guard not found in reclaimOrphan body');
-    assert.ok(killGroupPos >= 0, 'killGroup call not found in reclaimOrphan body');
-    assert.ok(
-      enoentPos < killGroupPos,
-      `ENOENT guard (pos ${enoentPos}) must precede killGroup (pos ${killGroupPos}) so missing file exits before kill`,
-    );
-  });
-});
-
 // ---------------------------------------------------------------------------
 // T-3.4: Stale log suppression scenario (AC-013) — source-level invariants
 //

From b930dee8b5387b987da4a88af9d1f2269d21cd4b Mon Sep 17 00:00:00 2001
From: Alexander Ponamarev <alex.ponamaryov@gmail.com>
Date: Thu, 28 May 2026 14:16:33 -0500
Subject: [PATCH 3/6] PLN-745: Verify sidecar ownership before reclaiming
 orphan PID

- reclaimOrphan now gates SIGKILL on PID-file-independent ownership
  signals: the live process's command line must still run our sidecar
  entry AND its OS start-time must match the value recorded at spawn.
  sessionToken presence alone could not witness ownership (written and
  read by the same record), so a recycled/foreign pid on the fixed port
  could be killed. Now an unverified pid fails safe (skip kill).
- writePidFile persists startTime (ps -o lstart=) as part of the
  ownership identity; add getProcessCommand/getProcessStartTime helpers
  (ps -ww, fail-safe to null, macOS+Linux portable).
- Make lastExitWasPortConflict private to match all sibling fields.
- Widen the test source-window for handleExit (1200->2000) and
  reclaimOrphan (1600->3200) so boundary-straddling assertion targets
  are not silently truncated.

Testing:
- just desktop-typecheck, just desktop-lint: clean
- pnpm -C apps/desktop test: 104/104 pass (39 sidecar, incl. new
  AC-008d/AC-008e ownership regression tests and updated AC-006b)

Risks:
- ps invocation per reclaim attempt; failures degrade to skip-kill, so a
  genuine orphan after an app-path change falls through to EADDRINUSE
  backoff rather than being reclaimed (never blocks boot).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../desktop/src/main/agent-monitor-sidecar.ts | 87 +++++++++++++++++--
 .../test/agent-monitor-sidecar.test.ts        | 49 ++++++++++-
 2 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/apps/desktop/src/main/agent-monitor-sidecar.ts b/apps/desktop/src/main/agent-monitor-sidecar.ts
index 1cf0ad2a..f41ac59a 100644
--- a/apps/desktop/src/main/agent-monitor-sidecar.ts
+++ b/apps/desktop/src/main/agent-monitor-sidecar.ts
@@ -1,10 +1,11 @@
 import { app } from "electron";
-import { spawn, type ChildProcess } from "node:child_process";
+import { spawn, execFile, type ChildProcess } from "node:child_process";
 import { randomUUID } from "node:crypto";
 import { existsSync } from "node:fs";
 import fs from "node:fs/promises";
 import { createRequire } from "node:module";
 import path from "node:path";
+import { promisify } from "node:util";
 
 import { AGENT_MONITOR_PORT } from "../shared/contracts.js";
 import { gatewayLog } from "./gateway-logger.js";
@@ -38,6 +39,7 @@ const RESTART_MAX_DELAY_MS = 30_000;
 // grace + process-group SIGKILL keeps app shutdown within budget.
 const STOP_GRACE_MS = 2_000;
 const requireFromHere = createRequire(import.meta.url);
+const execFileAsync = promisify(execFile);
 
 // Runs the generated Claude-Code-Agent-Monitor runtime tree as a managed
 // localhost child process. The Electron binary is reused as the Node runtime
@@ -57,7 +59,7 @@ export class AgentMonitorSidecar {
   private ready = false;
   private restartAttempts = 0;
   private readyResolvers: Array<(ok: boolean) => void> = [];
-  lastExitWasPortConflict: boolean = false;
+  private lastExitWasPortConflict = false;
   private onTerminalFailure?: (reason: string) => void;
 
   constructor(options?: { onTerminalFailure?: (reason: string) => void }) {
@@ -165,10 +167,16 @@ export class AgentMonitorSidecar {
     }
     let pid: number;
     let sessionToken: string | undefined;
+    let recordedStartTime: string | null;
     try {
-      const parsed = JSON.parse(raw) as { pid: number; sessionToken?: string };
+      const parsed = JSON.parse(raw) as {
+        pid: number;
+        sessionToken?: string;
+        startTime?: string | null;
+      };
       pid = parsed.pid;
       sessionToken = parsed.sessionToken;
+      recordedStartTime = parsed.startTime ?? null;
     } catch (error) {
       gatewayLog.warn(TAG, `failed to parse PID file: ${describe(error)}`);
       await this.deletePidFile();
@@ -191,8 +199,35 @@ export class AgentMonitorSidecar {
       return;
     }
     if (isRunning(pid)) {
-      gatewayLog.info(TAG, `reclaiming orphan sidecar pid=${pid}`);
-      killGroup(pid, "SIGKILL");
+      // sessionToken presence only proves THIS app authored the PID file — it
+      // cannot prove the pid still belongs to our sidecar (the token is written
+      // and read by the same record, so it has no independent witness). PIDs are
+      // recycled, and our port is fixed, so a stale pid may now belong to an
+      // unrelated process. Before SIGKILL we verify ownership against the live
+      // process itself: its command line must still be running our sidecar
+      // entry, and its OS start-time must match what we recorded at spawn. Both
+      // are independent of the PID file, so a recycled/foreign pid fails the
+      // check and is never killed.
+      const { entryFile } = resolveAgentMonitorPaths();
+      const [command, liveStartTime] = await Promise.all([
+        getProcessCommand(pid),
+        getProcessStartTime(pid),
+      ]);
+      const runsOurEntry = command !== null && command.includes(entryFile);
+      // If we could not record a start-time at spawn (ps unavailable), fall back
+      // to the command-line identity alone rather than refusing to ever reclaim.
+      const startTimeMatches =
+        recordedStartTime === null ||
+        (liveStartTime !== null && liveStartTime === recordedStartTime);
+      if (runsOurEntry && startTimeMatches) {
+        gatewayLog.info(TAG, `reclaiming orphan sidecar pid=${pid}`);
+        killGroup(pid, "SIGKILL");
+      } else {
+        gatewayLog.warn(
+          TAG,
+          `pid=${pid} does not match our sidecar identity (recycled or foreign process) — skipping kill`,
+        );
+      }
     }
     await this.deletePidFile();
   }
@@ -203,6 +238,7 @@ export class AgentMonitorSidecar {
     const payload = JSON.stringify({
       pid,
       sessionToken: this.sessionToken,
+      startTime: await getProcessStartTime(pid),
       recordedAt: new Date().toISOString(),
     });
     try {
@@ -487,6 +523,47 @@ function isRunning(pid: number): boolean {
   }
 }
 
+// OS-observed process identity used to confirm a recorded pid still belongs to
+// our sidecar before SIGKILL (see reclaimOrphan). `ps` is available on both
+// macOS (the packaged target) and Linux; on macOS another process's env is not
+// readable and there is no /proc, so the command line + start-time are the only
+// portable, independent ownership signals. Both helpers return null on any
+// failure so the caller fails safe (skip kill) rather than throwing.
+
+// Full argv of `pid` (-ww disables width truncation so a long entry path is
+// never cut off). Used to check the live process is still running our sidecar.
+async function getProcessCommand(pid: number): Promise<string | null> {
+  return queryProcess(pid, "command=");
+}
+
+// OS start-time of `pid`. Stable for the life of a process, so a recycled pid
+// (now a different process) reports a different value than what we recorded.
+async function getProcessStartTime(pid: number): Promise<string | null> {
+  return queryProcess(pid, "lstart=");
+}
+
+async function queryProcess(
+  pid: number,
+  field: "command=" | "lstart=",
+): Promise<string | null> {
+  if (!Number.isInteger(pid) || pid <= 0) {
+    return null;
+  }
+  try {
+    const { stdout } = await execFileAsync("ps", [
+      "-ww",
+      "-p",
+      String(pid),
+      "-o",
+      field,
+    ]);
+    const value = stdout.trim();
+    return value.length > 0 ? value : null;
+  } catch {
+    return null;
+  }
+}
+
 function delay(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
diff --git a/apps/desktop/test/agent-monitor-sidecar.test.ts b/apps/desktop/test/agent-monitor-sidecar.test.ts
index c5b59799..fcf92bdd 100644
--- a/apps/desktop/test/agent-monitor-sidecar.test.ts
+++ b/apps/desktop/test/agent-monitor-sidecar.test.ts
@@ -45,8 +45,13 @@ function methodBody(signature: string, windowChars: number): string {
   return sidecarSource.slice(idx, idx + windowChars);
 }
 
-const reclaimOrphanBody = methodBody("private async reclaimOrphan()", 1600);
-const handleExitBody = methodBody("private handleExit(", 1200);
+// Windows are sized to comfortably contain the full method body so a
+// boundary-straddling assertion target (e.g. a string near the method's end)
+// is never silently truncated out of the slice. Pad generously; the cost is a
+// few extra chars of unrelated source, the failure mode of being too small is a
+// misleading "not found" that blames production code for a test-window bug.
+const reclaimOrphanBody = methodBody("private async reclaimOrphan()", 3200);
+const handleExitBody = methodBody("private handleExit(", 2000);
 const launchBody = methodBody("private async launch()", 4000);
 
 // ---------------------------------------------------------------------------
@@ -65,8 +70,13 @@ describe("agent-monitor-sidecar.ts source-level invariants", () => {
     );
   });
 
-  test("AC-006b: writePidFile persists { pid, sessionToken, recordedAt } JSON", () => {
-    assert.match(sidecarSource, /pid,\s*sessionToken: this\.sessionToken,\s*recordedAt:/);
+  test("AC-006b: writePidFile persists { pid, sessionToken, startTime, recordedAt } JSON", () => {
+    // startTime (OS process start-time captured at spawn) is part of the
+    // ownership identity reclaimOrphan re-verifies against the live process.
+    assert.match(
+      sidecarSource,
+      /pid,\s*sessionToken: this\.sessionToken,\s*startTime: await getProcessStartTime\(pid\),\s*recordedAt:/,
+    );
   });
 
   test("AC-006c: writePidFile ensures agent-monitor directory exists with mkdir recursive", () => {
@@ -161,6 +171,37 @@ describe("agent-monitor-sidecar.ts source-level invariants", () => {
     assert.doesNotMatch(reclaimOrphanBody, /SIGTERM/);
   });
 
+  test("AC-008d: reclaimOrphan verifies live-process ownership (command + start-time) before SIGKILL", () => {
+    // A live pid is only SIGKILLed when BOTH independent, PID-file-independent
+    // signals confirm it is still our sidecar: its command line runs our entry
+    // file, and its OS start-time matches the value recorded at spawn. This is
+    // the guard that prevents killing a recycled/foreign process holding the
+    // fixed port — sessionToken presence alone is insufficient (it has no
+    // independent witness).
+    assert.match(reclaimOrphanBody, /const runsOurEntry =\s*command !== null && command\.includes\(entryFile\)/);
+    assert.match(reclaimOrphanBody, /liveStartTime !== null && liveStartTime === recordedStartTime/);
+
+    // The ownership check must precede the SIGKILL — the kill is gated on it.
+    const ownershipPos = reclaimOrphanBody.indexOf("runsOurEntry && startTimeMatches");
+    const killGroupPos = reclaimOrphanBody.indexOf('killGroup(pid, "SIGKILL")');
+    assert.ok(ownershipPos >= 0, "ownership check (runsOurEntry && startTimeMatches) not found in reclaimOrphan");
+    assert.ok(killGroupPos >= 0, 'killGroup(pid, "SIGKILL") not found in reclaimOrphan');
+    assert.ok(
+      ownershipPos < killGroupPos,
+      `ownership check (pos ${ownershipPos}) must gate SIGKILL (pos ${killGroupPos})`,
+    );
+  });
+
+  test("AC-008e: reclaimOrphan logs and skips kill when the live process is not our sidecar", () => {
+    // The else-branch of the ownership check must warn and fall through to the
+    // unconditional deletePidFile WITHOUT calling killGroup, so a recycled or
+    // foreign pid is never signalled.
+    assert.match(
+      reclaimOrphanBody,
+      /recycled or foreign process[\s\S]{0,80}skipping kill/,
+    );
+  });
+
   // -------------------------------------------------------------------------
   // AC-009: Terminal failure callback
   // -------------------------------------------------------------------------

From 350574bf9f9de07d53d4a79ad6be23e412d119e8 Mon Sep 17 00:00:00 2001
From: Alexander Ponamarev <alex.ponamaryov@gmail.com>
Date: Thu, 28 May 2026 14:29:40 -0500
Subject: [PATCH 4/6] PLN-745: Persist degraded tray on monitor failure; settle
 reclaim before respawn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses two actionable PR #247 review comments (thadeusb):

- Tray state: the one-shot tray.setState("degraded") in the sidecar
  onTerminalFailure callback was stomped by the next refreshTrayState()
  (cloud heartbeat / gateway recheck), which only branched on
  gatewayHealthy/cloudCommandsPaused/cloudStatus. Add a tracked in-memory
  agentMonitorFailed field (modeled on cloudCommandsPaused); the callback
  latches it and routes through refreshTrayState(), which now consults it
  in a branch ranked just below gateway-down and above the cloud branches
  so the degraded indicator sticks. In-memory only (per-process verdict;
  fresh boot re-attempts the sidecar) — no store schema change.
- Reclaim race: reclaimOrphan SIGKILLed the orphan then returned, and
  launch() respawned immediately; SIGKILL is not synchronous with the OS
  releasing the fixed port, so the first respawn could hit EADDRINUSE.
  Add a bounded isRunning(pid) poll (RECLAIM_WAIT_TIMEOUT_MS, reusing
  delay()/READY_POLL_INTERVAL_MS) after the kill so the port is freed
  before respawn. Bounded so it never stalls the fire-and-forget boot;
  handleExit() backoff remains the fallback.

The third review thread (sessionToken ownership) was already addressed
in b930dee and needs no change.

Testing:
- just desktop-typecheck, just desktop-lint: clean
- pnpm -C apps/desktop test: all pass (added AC-008f bounded-wait
  invariant and a refreshTrayState/agentMonitorFailed wiring test;
  widened reclaimOrphanBody source-window 3200->4000)

Risks:
- Bounded reclaim wait adds up to RECLAIM_WAIT_TIMEOUT_MS (2s) only on
  the path where an orphan was actually killed; skip-kill paths return
  immediately and boot is never blocked.
- agentMonitorFailed latches for the process lifetime (no manual
  re-enable path exists yet); a future re-enable would clear it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../desktop/src/main/agent-monitor-sidecar.ts | 16 ++++++++
 apps/desktop/src/main/app.ts                  | 27 ++++++++++++-
 .../test/agent-monitor-sidecar.test.ts        | 17 +++++++-
 .../test/agent-monitor-wiring-static.test.ts  | 39 +++++++++++++++++++
 4 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/apps/desktop/src/main/agent-monitor-sidecar.ts b/apps/desktop/src/main/agent-monitor-sidecar.ts
index f41ac59a..4263292a 100644
--- a/apps/desktop/src/main/agent-monitor-sidecar.ts
+++ b/apps/desktop/src/main/agent-monitor-sidecar.ts
@@ -38,6 +38,11 @@ const RESTART_MAX_DELAY_MS = 30_000;
 // forced process.exit(0). DB integrity is already flushed by then, so a short
 // grace + process-group SIGKILL keeps app shutdown within budget.
 const STOP_GRACE_MS = 2_000;
+// Bounds how long reclaimOrphan waits for a SIGKILLed orphan to actually exit
+// (and release the fixed port) before launch() respawns. Kept short — same order
+// as STOP_GRACE_MS — so a lingering pid can never stall the fire-and-forget boot;
+// handleExit()'s exponential-backoff restart loop is the fallback if it times out.
+const RECLAIM_WAIT_TIMEOUT_MS = 2_000;
 const requireFromHere = createRequire(import.meta.url);
 const execFileAsync = promisify(execFile);
 
@@ -222,6 +227,17 @@ export class AgentMonitorSidecar {
       if (runsOurEntry && startTimeMatches) {
         gatewayLog.info(TAG, `reclaiming orphan sidecar pid=${pid}`);
         killGroup(pid, "SIGKILL");
+        // SIGKILL delivery is not synchronous with the OS releasing the orphan's
+        // listening socket on our fixed port. Wait (bounded) for the pid to
+        // actually exit so the imminent respawn in launch() binds on the first
+        // attempt instead of racing a not-yet-released port and hitting
+        // EADDRINUSE. The deadline guarantees this never stalls the
+        // fire-and-forget boot; if the pid lingers past it, handleExit()'s
+        // exponential-backoff restart loop recovers on a later attempt.
+        const deadline = Date.now() + RECLAIM_WAIT_TIMEOUT_MS;
+        while (isRunning(pid) && Date.now() < deadline) {
+          await delay(READY_POLL_INTERVAL_MS);
+        }
       } else {
         gatewayLog.warn(
           TAG,
diff --git a/apps/desktop/src/main/app.ts b/apps/desktop/src/main/app.ts
index 53209411..1315acac 100644
--- a/apps/desktop/src/main/app.ts
+++ b/apps/desktop/src/main/app.ts
@@ -217,6 +217,13 @@ export class DesktopApplication {
   private dangerousAutoApprove = false;
   private cloudStatus: CloudSocketStatus = { state: "idle" };
   private cloudCommandsPaused: boolean;
+  // In-memory supervisor verdict: set once the agent-monitor sidecar gives up
+  // permanently (after MAX_RESTART_ATTEMPTS). refreshTrayState() consults this so
+  // the degraded indicator sticks across later refreshes instead of being reset
+  // to ready by the next cloud heartbeat or gateway recheck. Not persisted — a
+  // fresh boot re-attempts the sidecar, so the verdict is per-process.
+  private agentMonitorFailed = false;
+  private agentMonitorFailureReason: string | null = null;
   private cloudConnectionEnabled: boolean;
   private serverCommandSigningSupported = false;
   private serverAgentSessionSyncSupported = false;
@@ -297,7 +304,13 @@ export class DesktopApplication {
           body: reason,
         });
         notification.show();
-        this.tray.setState("degraded", reason);
+        // Latch the failure and route through refreshTrayState() — the single
+        // owner of tray state — so the degraded indicator survives subsequent
+        // refreshes. A direct tray.setState here would be stomped by the next
+        // refreshTrayState() call (cloud heartbeat, gateway recheck).
+        this.agentMonitorFailed = true;
+        this.agentMonitorFailureReason = reason;
+        this.refreshTrayState();
       },
     });
     this.activityLog = new ActivityLogStore();
@@ -2088,6 +2101,18 @@ export class DesktopApplication {
       return;
     }
 
+    // A permanently-failed agent monitor keeps the tray degraded even when cloud
+    // is online/connecting (gateway-down above remains the higher-severity signal).
+    if (this.agentMonitorFailed) {
+      this.tray.setState(
+        "degraded",
+        explicitDetails ??
+          this.agentMonitorFailureReason ??
+          `Serving on localhost:${this.server.getActivePort()} | agent monitor unavailable`,
+      );
+      return;
+    }
+
     if (this.cloudCommandsPaused) {
       this.tray.setState(
         "degraded",
diff --git a/apps/desktop/test/agent-monitor-sidecar.test.ts b/apps/desktop/test/agent-monitor-sidecar.test.ts
index fcf92bdd..19f87454 100644
--- a/apps/desktop/test/agent-monitor-sidecar.test.ts
+++ b/apps/desktop/test/agent-monitor-sidecar.test.ts
@@ -50,7 +50,7 @@ function methodBody(signature: string, windowChars: number): string {
 // is never silently truncated out of the slice. Pad generously; the cost is a
 // few extra chars of unrelated source, the failure mode of being too small is a
 // misleading "not found" that blames production code for a test-window bug.
-const reclaimOrphanBody = methodBody("private async reclaimOrphan()", 3200);
+const reclaimOrphanBody = methodBody("private async reclaimOrphan()", 4000);
 const handleExitBody = methodBody("private handleExit(", 2000);
 const launchBody = methodBody("private async launch()", 4000);
 
@@ -202,6 +202,21 @@ describe("agent-monitor-sidecar.ts source-level invariants", () => {
     );
   });
 
+  test("AC-008f: reclaimOrphan waits (bounded) for the SIGKILLed orphan to exit before returning", () => {
+    // SIGKILL is not synchronous with the orphan releasing the fixed port, so
+    // reclaimOrphan must poll isRunning(pid) on a bounded deadline after the kill
+    // before launch() respawns — otherwise the first respawn can race a
+    // not-yet-released socket and hit EADDRINUSE. Assert the exact bounded-wait
+    // invariant: a deadline built from the named timeout constant, gating a
+    // delay()-spaced isRunning(pid) poll loop, placed AFTER the SIGKILL.
+    assert.match(
+      reclaimOrphanBody,
+      /killGroup\(pid, "SIGKILL"\);[\s\S]{0,600}const deadline = Date\.now\(\) \+ RECLAIM_WAIT_TIMEOUT_MS;\s*while \(isRunning\(pid\) && Date\.now\(\) < deadline\) \{\s*await delay\(READY_POLL_INTERVAL_MS\);\s*\}/,
+    );
+    // The timeout constant must be defined so the wait is genuinely bounded.
+    assert.match(sidecarSource, /const RECLAIM_WAIT_TIMEOUT_MS = [\d_]+;/);
+  });
+
   // -------------------------------------------------------------------------
   // AC-009: Terminal failure callback
   // -------------------------------------------------------------------------
diff --git a/apps/desktop/test/agent-monitor-wiring-static.test.ts b/apps/desktop/test/agent-monitor-wiring-static.test.ts
index f6c3c2a6..7b2f3a44 100644
--- a/apps/desktop/test/agent-monitor-wiring-static.test.ts
+++ b/apps/desktop/test/agent-monitor-wiring-static.test.ts
@@ -392,6 +392,45 @@ test("hooks are opt-in: default off, silent server auto-install never enabled",
   assert.match(appSource, /syncAgentMonitorHooksOnBoot\(\)/);
 });
 
+test("agent monitor terminal failure sets a tracked degraded state that refreshTrayState consults", () => {
+  // The one-shot tray.setState in onTerminalFailure was being stomped by the
+  // next refreshTrayState() call (cloud heartbeat / gateway recheck), which
+  // only branched on gatewayHealthy / cloudCommandsPaused / cloudStatus. The
+  // degraded indicator must instead be backed by a tracked field so it sticks.
+  // (PR #247 review — thadeusb.)
+
+  // 1. A tracked field exists.
+  assert.match(appSource, /private agentMonitorFailed = false;/);
+
+  // 2. onTerminalFailure latches the field and routes through refreshTrayState()
+  //    rather than calling tray.setState directly (which would be transient).
+  assert.match(
+    appSource,
+    /onTerminalFailure: \(reason: string\) => \{[\s\S]*this\.agentMonitorFailed = true;[\s\S]*this\.refreshTrayState\(\);[\s\S]*\},/,
+  );
+
+  // 3. refreshTrayState() actually consults the field (degraded state is owned
+  //    by the single state owner, not set out-of-band).
+  assert.match(
+    appSource,
+    /private refreshTrayState\([\s\S]*if \(this\.agentMonitorFailed\) \{[\s\S]*this\.tray\.setState\(\s*"degraded"/,
+  );
+
+  // 4. The degraded-monitor branch outranks cloud state: it must appear before
+  //    the cloudStatus "online" branch so an online cloud cannot reset the tray
+  //    to ready while the monitor is dead.
+  const failedBranchIdx = appSource.indexOf("if (this.agentMonitorFailed)");
+  const cloudOnlineBranchIdx = appSource.indexOf(
+    'if (this.cloudStatus.state === "online")',
+  );
+  assert.ok(failedBranchIdx > 0, "agentMonitorFailed branch not found in refreshTrayState");
+  assert.ok(cloudOnlineBranchIdx > 0, "cloud online branch not found in refreshTrayState");
+  assert.ok(
+    failedBranchIdx < cloudOnlineBranchIdx,
+    "agentMonitorFailed branch must precede the cloud-online branch so the degraded indicator is not overwritten",
+  );
+});
+
 test("shutdown sequence stops the sidecar before the server", () => {
   assert.match(shutdownSource, /agentMonitor: \{ stop:/);
   assert.match(

From 94f6d176500dc4a1c5dc32f2e7072240d59667c6 Mon Sep 17 00:00:00 2001
From: Alexander Ponamarev <alex.ponamaryov@gmail.com>
Date: Thu, 28 May 2026 14:35:08 -0500
Subject: [PATCH 5/6] PLN-745: Persist sidecar PID before readiness

- Write sidecar PID file immediately after spawn succeeds

- Update invariant coverage for startup-window orphan recovery

Testing: Focused sidecar tests, typecheck, and lint passed

Risks: Low; readiness remains gated by health and stability checks
---
 .../desktop/src/main/agent-monitor-sidecar.ts |  2 +-
 .../test/agent-monitor-sidecar.test.ts        | 21 ++++++++++++-------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/apps/desktop/src/main/agent-monitor-sidecar.ts b/apps/desktop/src/main/agent-monitor-sidecar.ts
index 4263292a..483f2369 100644
--- a/apps/desktop/src/main/agent-monitor-sidecar.ts
+++ b/apps/desktop/src/main/agent-monitor-sidecar.ts
@@ -321,6 +321,7 @@ export class AgentMonitorSidecar {
       TAG,
       `starting agent monitor pid=${child.pid} port=${this.port}`,
     );
+    await this.writePidFile(child.pid);
 
     pipeLines(child.stdout, (line) => gatewayLog.debug(TAG, line));
     pipeLines(child.stderr, (line) => {
@@ -349,7 +350,6 @@ export class AgentMonitorSidecar {
           TAG,
           `agent monitor ready at http://${HOST}:${this.port}`,
         );
-        await this.writePidFile(child.pid!);
         this.flushReady(true);
         return;
       }
diff --git a/apps/desktop/test/agent-monitor-sidecar.test.ts b/apps/desktop/test/agent-monitor-sidecar.test.ts
index 19f87454..eeba19e4 100644
--- a/apps/desktop/test/agent-monitor-sidecar.test.ts
+++ b/apps/desktop/test/agent-monitor-sidecar.test.ts
@@ -60,7 +60,7 @@ const launchBody = methodBody("private async launch()", 4000);
 
 describe("agent-monitor-sidecar.ts source-level invariants", () => {
   // -------------------------------------------------------------------------
-  // AC-006: PID file lifecycle — write on stability, delete on stop()
+  // AC-006: PID file lifecycle — write after spawn, delete on stop()
   // -------------------------------------------------------------------------
 
   test("AC-006a: writePidFile uses atomic rename (write .tmp then rename)", () => {
@@ -103,12 +103,19 @@ describe("agent-monitor-sidecar.ts source-level invariants", () => {
     );
   });
 
-  test("AC-006f: writePidFile is called only after the stability window confirms readiness", () => {
-    // writePidFile must be called AFTER delay(READY_STABILITY_WINDOW_MS) and
-    // inside the isChildAliveAndCurrent check, not unconditionally after spawn.
-    assert.match(
-      sidecarSource,
-      /await delay\(READY_STABILITY_WINDOW_MS\)[\s\S]{0,400}await this\.writePidFile\(child\.pid!/,
+  test("AC-006f: writePidFile is called after spawn before health waits", () => {
+    // The PID file must be written as soon as a child pid exists, before
+    // waitForHealth() and the stability window, so a force-quit during startup
+    // leaves enough metadata for the next launch to reclaim the orphan.
+    const pidGuardPos = launchBody.indexOf("if (!child.pid)");
+    const writePidPos = launchBody.indexOf("await this.writePidFile(child.pid)");
+    const waitForHealthPos = launchBody.indexOf("const healthy = await this.waitForHealth(child)");
+    assert.ok(pidGuardPos >= 0, "child.pid guard not found in launch()");
+    assert.ok(writePidPos >= 0, "writePidFile(child.pid) not found in launch()");
+    assert.ok(waitForHealthPos >= 0, "waitForHealth(child) not found in launch()");
+    assert.ok(
+      pidGuardPos < writePidPos && writePidPos < waitForHealthPos,
+      "writePidFile(child.pid) must run after the pid guard and before waitForHealth(child)",
     );
   });
 

From 73ed14dcd19cb472a75e120334890496941a0852 Mon Sep 17 00:00:00 2001
From: Alexander Ponamarev <17112266+aponamarev@users.noreply.github.com>
Date: Thu, 28 May 2026 14:39:59 -0500
Subject: [PATCH 6/6] Bump version from 0.15.91 to 0.15.92

---
 apps/desktop/package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/desktop/package.json b/apps/desktop/package.json
index f1c318a1..3efdaa61 100644
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@@ -1,6 +1,6 @@
 {
   "name": "desktop",
-  "version": "0.15.91",
+  "version": "0.15.92",
   "description": "ClosedLoop Desktop",
   "author": "ClosedLoop AI <support@closedloop.ai>",
   "private": true,