From a6283371745e79a0321d974a67aa7071f983a863 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 07:30:02 -0600
Subject: [PATCH 01/14] feat: add quick-checks validation for fast
 typecheck/build feedback

Restructure validation into composable steps so typecheck (~5s) runs
independently before full validation. Quick checks short-circuit on
typecheck failure and render errors as actionable agent prompts,
laying the foundation for the agent retry loop.
---
 src/lib/agent-runner.ts                 |  16 +-
 src/lib/events.ts                       |   7 +
 src/lib/validation/build-validator.ts   |   6 +-
 src/lib/validation/index.ts             |  12 +-
 src/lib/validation/quick-checks.spec.ts | 273 +++++++++++++++++++++++
 src/lib/validation/quick-checks.ts      | 274 ++++++++++++++++++++++++
 src/lib/validation/types.ts             |  17 ++
 src/lib/validation/validator.ts         |  39 ++--
 8 files changed, 625 insertions(+), 19 deletions(-)
 create mode 100644 src/lib/validation/quick-checks.spec.ts
 create mode 100644 src/lib/validation/quick-checks.ts
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index b6fcabf..79a453e 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -1,5 +1,5 @@
 import { SPINNER_MESSAGE, type FrameworkConfig } from './framework-config.js';
-import { validateInstallation } from './validation/index.js';
+import { validateInstallation, runQuickChecks } from './validation/index.js';
 import type { InstallerOptions } from '../utils/types.js';
 import {
   ensurePackageIsInstalled,
@@ -135,10 +135,22 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
 
   // Run post-installation validation
   if (!options.noValidate) {
+    // Quick checks: fast typecheck + build before full validation
+    options.emitter?.emit('validation:quick:start', {});
+
+    const quickCheckResult = await runQuickChecks(options.installDir);
+
+    options.emitter?.emit('validation:quick:complete', {
+      passed: quickCheckResult.passed,
+      results: quickCheckResult.results,
+      durationMs: quickCheckResult.totalDurationMs,
+    });
+
+    // Full validation — skip build since quick checks already ran it
     options.emitter?.emit('validation:start', { framework: config.metadata.integration });
 
     const validationResult = await validateInstallation(config.metadata.integration, options.installDir, {
-      runBuild: true,
+      runBuild: false,
     });
 
     if (validationResult.issues.length > 0) {
diff --git a/src/lib/events.ts b/src/lib/events.ts
index cec5cc9..91458cf 100644
--- a/src/lib/events.ts
+++ b/src/lib/events.ts
@@ -53,6 +53,13 @@ export interface InstallerEvents {
   'agent:success': { summary?: string };
   'agent:failure': { message: string; stack?: string };
 
+  'validation:quick:start': Record<string, never>;
+  'validation:quick:complete': {
+    passed: boolean;
+    results: import('./validation/types.js').QuickCheckResult[];
+    durationMs: number;
+  };
+
   'validation:start': { framework: string };
   'validation:issues': { issues: import('./validation/types.js').ValidationIssue[] };
   'validation:complete': { passed: boolean; issueCount: number; durationMs: number };
diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts
index 8debd15..2e5fc9f 100644
--- a/src/lib/validation/build-validator.ts
+++ b/src/lib/validation/build-validator.ts
@@ -99,13 +99,13 @@ export async function runBuildValidation(projectDir: string, timeoutMs: number =
   });
 }
 
-function detectPackageManager(projectDir: string): 'pnpm' | 'yarn' | 'npm' {
+export function detectPackageManager(projectDir: string): 'pnpm' | 'yarn' | 'npm' {
   if (existsSync(join(projectDir, 'pnpm-lock.yaml'))) return 'pnpm';
   if (existsSync(join(projectDir, 'yarn.lock'))) return 'yarn';
   return 'npm';
 }
 
-async function hasBuildScriptInPackageJson(projectDir: string): Promise<boolean> {
+export async function hasBuildScriptInPackageJson(projectDir: string): Promise<boolean> {
   try {
     const content = await readFile(join(projectDir, 'package.json'), 'utf-8');
     const pkg = JSON.parse(content) as { scripts?: { build?: string } };
@@ -115,7 +115,7 @@ async function hasBuildScriptInPackageJson(projectDir: string): Promise<boolean>
   }
 }
 
-function parseBuildErrors(output: string): string[] {
+export function parseBuildErrors(output: string): string[] {
   const errors: string[] = [];
 
   // TypeScript errors: "file.ts(line,col): error TS..."
diff --git a/src/lib/validation/index.ts b/src/lib/validation/index.ts
index 49e74d2..c450b97 100644
--- a/src/lib/validation/index.ts
+++ b/src/lib/validation/index.ts
@@ -1,5 +1,13 @@
-export { validateInstallation, type ValidateOptions } from './validator.js';
+export {
+  validateInstallation,
+  validatePackages,
+  validateEnvVars,
+  validateFiles,
+  validateFrameworkSpecific,
+  type ValidateOptions,
+} from './validator.js';
 export { runBuildValidation, type BuildResult } from './build-validator.js';
+export { runQuickChecks, runTypecheckValidation } from './quick-checks.js';
 export type {
   ValidationResult,
   ValidationRules,
@@ -10,4 +18,6 @@ export type {
   EnvVarRule,
   FileRule,
   VariantRules,
+  QuickCheckResult,
+  QuickChecksOutput,
 } from './types.js';
diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts
new file mode 100644
index 0000000..888c2b9
--- /dev/null
+++ b/src/lib/validation/quick-checks.spec.ts
@@ -0,0 +1,273 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import { mkdtempSync, writeFileSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { EventEmitter } from 'node:events';
+
+// Mock child_process.spawn to avoid actually running tsc/build
+vi.mock('child_process', () => ({
+  spawn: vi.fn(),
+}));
+
+import { spawn } from 'child_process';
+import { runQuickChecks, runTypecheckValidation } from './quick-checks.js';
+
+const mockSpawn = vi.mocked(spawn);
+
+/**
+ * Creates a mock process lazily — must be used inside mockImplementationOnce,
+ * NOT mockReturnValueOnce, so the setTimeout fires after event listeners attach.
+ */
+function createMockProcess(exitCode: number, stdout = '', stderr = '') {
+  const proc = new EventEmitter() as any;
+  proc.stdout = new EventEmitter();
+  proc.stderr = new EventEmitter();
+
+  setTimeout(() => {
+    if (stdout) proc.stdout.emit('data', Buffer.from(stdout));
+    if (stderr) proc.stderr.emit('data', Buffer.from(stderr));
+    proc.emit('close', exitCode);
+  }, 10);
+
+  return proc;
+}
+
+describe('runQuickChecks', () => {
+  let testDir: string;
+
+  beforeEach(() => {
+    testDir = mkdtempSync(join(tmpdir(), 'quick-checks-test-'));
+    writeFileSync(
+      join(testDir, 'package.json'),
+      JSON.stringify({
+        scripts: { typecheck: 'tsc --noEmit', build: 'next build' },
+      }),
+    );
+    writeFileSync(join(testDir, 'pnpm-lock.yaml'), '');
+    mockSpawn.mockReset();
+  });
+
+  afterEach(() => {
+    rmSync(testDir, { recursive: true, force: true });
+  });
+
+  it('returns passed=true when both typecheck and build succeed', async () => {
+    mockSpawn
+      .mockImplementationOnce(() => createMockProcess(0))
+      .mockImplementationOnce(() => createMockProcess(0));
+
+    const result = await runQuickChecks(testDir);
+
+    expect(result.passed).toBe(true);
+    expect(result.results).toHaveLength(2);
+    expect(result.results[0].phase).toBe('typecheck');
+    expect(result.results[1].phase).toBe('build');
+    expect(result.agentRetryPrompt).toBeNull();
+  });
+
+  it('short-circuits build when typecheck fails', async () => {
+    const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable";
+
+    mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError));
+
+    const result = await runQuickChecks(testDir);
+
+    expect(result.passed).toBe(false);
+    expect(result.results).toHaveLength(1);
+    expect(result.results[0].phase).toBe('typecheck');
+    expect(result.results[0].passed).toBe(false);
+    expect(mockSpawn).toHaveBeenCalledTimes(1);
+  });
+
+  it('runs build after typecheck passes', async () => {
+    mockSpawn
+      .mockImplementationOnce(() => createMockProcess(0))
+      .mockImplementationOnce(() => createMockProcess(0));
+
+    const result = await runQuickChecks(testDir);
+
+    expect(result.passed).toBe(true);
+    expect(result.results).toHaveLength(2);
+    expect(mockSpawn).toHaveBeenCalledTimes(2);
+  });
+
+  it('skips build when skipBuild option is true', async () => {
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0));
+
+    const result = await runQuickChecks(testDir, { skipBuild: true });
+
+    expect(result.passed).toBe(true);
+    expect(result.results).toHaveLength(1);
+    expect(result.results[0].phase).toBe('typecheck');
+    expect(mockSpawn).toHaveBeenCalledTimes(1);
+  });
+
+  it('generates agentRetryPrompt when typecheck fails', async () => {
+    const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'.";
+    mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError));
+
+    const result = await runQuickChecks(testDir);
+
+    expect(result.agentRetryPrompt).not.toBeNull();
+    expect(result.agentRetryPrompt).toContain('typecheck failed');
+    expect(result.agentRetryPrompt).toContain('src/middleware.ts');
+  });
+
+  it('tracks total duration', async () => {
+    mockSpawn
+      .mockImplementationOnce(() => createMockProcess(0))
+      .mockImplementationOnce(() => createMockProcess(0));
+
+    const result = await runQuickChecks(testDir);
+
+    expect(typeof result.totalDurationMs).toBe('number');
+    expect(result.totalDurationMs).toBeGreaterThanOrEqual(0);
+  });
+
+  it('reports build failure when typecheck passes but build fails', async () => {
+    mockSpawn
+      .mockImplementationOnce(() => createMockProcess(0)) // typecheck pass
+      .mockImplementationOnce(() => createMockProcess(1, '', 'Error: Build failed')); // build fail
+
+    const result = await runQuickChecks(testDir);
+
+    expect(result.passed).toBe(false);
+    expect(result.results).toHaveLength(2);
+    expect(result.results[0].passed).toBe(true);
+    expect(result.results[1].passed).toBe(false);
+    expect(result.results[1].phase).toBe('build');
+    expect(result.agentRetryPrompt).toContain('build failed');
+  });
+});
+
+describe('runTypecheckValidation', () => {
+  let testDir: string;
+
+  beforeEach(() => {
+    testDir = mkdtempSync(join(tmpdir(), 'typecheck-test-'));
+    writeFileSync(
+      join(testDir, 'package.json'),
+      JSON.stringify({
+        scripts: { typecheck: 'tsc --noEmit' },
+      }),
+    );
+    writeFileSync(join(testDir, 'pnpm-lock.yaml'), '');
+    mockSpawn.mockReset();
+  });
+
+  afterEach(() => {
+    rmSync(testDir, { recursive: true, force: true });
+  });
+
+  it('returns passed=true when typecheck succeeds', async () => {
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0));
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(result.passed).toBe(true);
+    expect(result.phase).toBe('typecheck');
+    expect(result.issues).toHaveLength(0);
+    expect(result.agentPrompt).toBeNull();
+  });
+
+  it('parses TypeScript errors from output', async () => {
+    const tsError =
+      "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'.";
+    mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError));
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(result.passed).toBe(false);
+    expect(result.issues.length).toBeGreaterThan(0);
+    expect(result.issues[0].message).toContain('Type error');
+    expect(result.issues[0].severity).toBe('error');
+  });
+
+  it('formats errors into actionable agent prompt', async () => {
+    const tsError =
+      "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'.";
+    mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError));
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(result.agentPrompt).not.toBeNull();
+    expect(result.agentPrompt).toContain('src/middleware.ts');
+    expect(result.agentPrompt).toContain('not assignable');
+  });
+
+  it('handles pretty-printed tsc errors (colon-separated format)', async () => {
+    const tsError =
+      "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'.";
+    mockSpawn.mockImplementationOnce(() => createMockProcess(1, tsError, ''));
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(result.passed).toBe(false);
+    expect(result.issues.length).toBeGreaterThan(0);
+  });
+
+  it('provides fallback message when errors cannot be parsed', async () => {
+    mockSpawn.mockImplementationOnce(() =>
+      createMockProcess(1, '', 'Some unknown error format that we cannot parse'),
+    );
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(result.passed).toBe(false);
+    expect(result.issues).toHaveLength(1);
+    expect(result.issues[0].message).toBe('Typecheck failed');
+  });
+
+  it('uses typecheck script from package.json when available', async () => {
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0));
+
+    await runTypecheckValidation(testDir);
+
+    expect(mockSpawn).toHaveBeenCalledWith(
+      'pnpm',
+      ['typecheck'],
+      expect.objectContaining({ cwd: testDir }),
+    );
+  });
+
+  it('falls back to npx tsc --noEmit when no typecheck script', async () => {
+    writeFileSync(
+      join(testDir, 'package.json'),
+      JSON.stringify({ scripts: { build: 'next build' } }),
+    );
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0));
+
+    await runTypecheckValidation(testDir);
+
+    expect(mockSpawn).toHaveBeenCalledWith(
+      'npx',
+      ['tsc', '--noEmit'],
+      expect.objectContaining({ cwd: testDir }),
+    );
+  });
+
+  it('detects type-check script (hyphenated variant)', async () => {
+    writeFileSync(
+      join(testDir, 'package.json'),
+      JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } }),
+    );
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0));
+
+    await runTypecheckValidation(testDir);
+
+    expect(mockSpawn).toHaveBeenCalledWith(
+      'pnpm',
+      ['type-check'],
+      expect.objectContaining({ cwd: testDir }),
+    );
+  });
+
+  it('tracks duration', async () => {
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0));
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(typeof result.durationMs).toBe('number');
+    expect(result.durationMs).toBeGreaterThanOrEqual(0);
+  });
+});
diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts
new file mode 100644
index 0000000..1a0b3a2
--- /dev/null
+++ b/src/lib/validation/quick-checks.ts
@@ -0,0 +1,274 @@
+import { spawn } from 'child_process';
+import { readFile } from 'fs/promises';
+import { join } from 'path';
+import type { QuickCheckResult, QuickChecksOutput, ValidationIssue } from './types.js';
+import { detectPackageManager, parseBuildErrors, runBuildValidation } from './build-validator.js';
+
+const DEFAULT_TYPECHECK_TIMEOUT_MS = 30_000;
+const DEFAULT_BUILD_TIMEOUT_MS = 60_000;
+
+/**
+ * Run the fast deterministic checks in order: typecheck, then build.
+ * The build is skipped when `skipBuild` is set or the typecheck failed.
+ * A caller-supplied `timeoutMs` applies to each phase individually; otherwise
+ * each phase uses its own default timeout.
+ */
+export async function runQuickChecks(
+  projectDir: string,
+  options?: { skipBuild?: boolean; timeoutMs?: number },
+): Promise<QuickChecksOutput> {
+  const startedAt = Date.now();
+  const { skipBuild = false, timeoutMs } = options ?? {};
+
+  // Phase 1: typecheck always runs.
+  const typecheck = await runTypecheckValidation(projectDir, timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS);
+  const results: QuickCheckResult[] = [typecheck];
+
+  // Phase 2: build, unless skipped or the typecheck already failed.
+  const shouldBuild = typecheck.passed && !skipBuild;
+  if (shouldBuild) {
+    results.push(await runBuildQuickCheck(projectDir, timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS));
+  }
+
+  const allPassed = results.every((result) => result.passed);
+  const agentRetryPrompt = allPassed ? null : formatForAgent(results);
+
+  return {
+    passed: allPassed,
+    results,
+    agentRetryPrompt,
+    totalDurationMs: Date.now() - startedAt,
+  };
+}
+
+/**
+ * Run the project's typecheck command and report the outcome as a quick check.
+ * Exit code 0 (or no detected command) passes with no issues. Any other exit
+ * code fails with one issue per parsed TypeScript error, or a single generic
+ * "Typecheck failed" issue when nothing could be parsed, plus an agent prompt.
+ */
+export async function runTypecheckValidation(
+  projectDir: string,
+  timeoutMs: number = DEFAULT_TYPECHECK_TIMEOUT_MS,
+): Promise<QuickCheckResult> {
+  const startedAt = Date.now();
+
+  // Shared result for both "nothing to run" and "ran cleanly".
+  const passing = (): QuickCheckResult => ({
+    passed: true,
+    phase: 'typecheck',
+    issues: [],
+    agentPrompt: null,
+    durationMs: Date.now() - startedAt,
+  });
+
+  const typecheckCmd = await detectTypecheckCommand(projectDir);
+  if (!typecheckCmd) {
+    // No typecheck available — pass through
+    return passing();
+  }
+
+  const { command, args } = typecheckCmd;
+  const { exitCode, stdout, stderr } = await spawnCommand(command, args, projectDir, timeoutMs);
+  if (exitCode === 0) {
+    return passing();
+  }
+
+  // Non-zero exit: derive issues and the agent prompt from the combined output.
+  const output = stdout + stderr;
+  const errors = parseTypecheckErrors(output);
+
+  let issues: ValidationIssue[];
+  if (errors.length > 0) {
+    // One issue per parsed diagnostic.
+    issues = errors.map(
+      (error): ValidationIssue => ({
+        type: 'file',
+        severity: 'error',
+        message: `Type error: ${error}`,
+        hint: 'Fix the type error and run typecheck again',
+      }),
+    );
+  } else {
+    // Fallback if no specific errors parsed
+    issues = [
+      {
+        type: 'file',
+        severity: 'error',
+        message: 'Typecheck failed',
+        hint: `Run \`${command} ${args.join(' ')}\` to see full output`,
+      },
+    ];
+  }
+
+  return {
+    passed: false,
+    phase: 'typecheck',
+    issues,
+    agentPrompt: formatTypecheckErrors(errors, output),
+    durationMs: Date.now() - startedAt,
+  };
+}
+
+/**
+ * Adapt `runBuildValidation` to the quick-check result shape (phase `build`).
+ */
+async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise<QuickCheckResult> {
+  const { success, issues, durationMs } = await runBuildValidation(projectDir, timeoutMs);
+
+  return {
+    passed: success,
+    phase: 'build',
+    issues,
+    agentPrompt: success ? null : formatBuildErrors(issues),
+    durationMs,
+  };
+}
+
+interface TypecheckCommand {
+  command: string; // executable to spawn: the detected package manager or `npx`
+  args: string[]; // arguments passed to `command`
+}
+
+/**
+ * Resolve the command used to typecheck the project.
+ * Prefers a `typecheck` or `type-check` package.json script, run through the
+ * detected package manager; otherwise falls back to `npx tsc --noEmit`.
+ */
+async function detectTypecheckCommand(projectDir: string): Promise<TypecheckCommand | null> {
+  const packageManager = detectPackageManager(projectDir);
+
+  try {
+    const manifest = JSON.parse(await readFile(join(projectDir, 'package.json'), 'utf-8')) as {
+      scripts?: Record<string, string>;
+    };
+    // `typecheck` wins over `type-check` when both scripts exist.
+    const scriptName = ['typecheck', 'type-check'].find((name) => manifest.scripts?.[name]);
+    if (scriptName) {
+      // npm is invoked as `npm run <script>`; other managers get the bare script name.
+      return {
+        command: packageManager,
+        args: packageManager === 'npm' ? ['run', scriptName] : [scriptName],
+      };
+    }
+  } catch {
+    // Missing or malformed package.json — use the fallback below.
+  }
+
+  // No script found: invoke the TypeScript compiler directly.
+  return { command: 'npx', args: ['tsc', '--noEmit'] };
+}
+
+/**
+ * Parse TypeScript diagnostics out of typecheck output (at most 10, in order).
+ * Recognizes `file(line,col): error TSxxxx: ...` and the `--pretty` form
+ * `file:line:col - error TSxxxx: ...`. Paths may contain any non-space
+ * characters (hyphens, `@`, `[slug]`, backslashes), not just word characters.
+ */
+function parseTypecheckErrors(output: string): string[] {
+  const maxErrors = 10;
+  const diagnostic = /(\S+\.\w+)(?:\((\d+),(\d+)\):|:(\d+):(\d+)\s*-)\s*error\s+(TS\d+):.+/g;
+  const seen = new Set<string>();
+  const errors: string[] = [];
+
+  for (const match of output.matchAll(diagnostic)) {
+    const [, file, parenLine, parenCol, prettyLine, prettyCol, code] = match;
+    // Key on location + code: the same diagnostic printed in both formats is
+    // reported once, while distinct errors in the same file are all kept.
+    const key = `${file}:${parenLine ?? prettyLine}:${parenCol ?? prettyCol}:${code}`;
+    if (seen.has(key)) continue;
+    seen.add(key);
+    errors.push(match[0]);
+    if (errors.length === maxErrors) break;
+  }
+
+  return errors;
+}
+
+/**
+ * Format typecheck errors into an agent-ready prompt.
+ * Each parsed diagnostic becomes `- file:line:col: message (TSxxxx)`; when
+ * nothing was parsed, the first 2000 characters of raw output are sent instead.
+ */
+function formatTypecheckErrors(errors: string[], rawOutput: string): string {
+  if (errors.length === 0) {
+    // Couldn't parse specific errors — give (truncated) raw output
+    const truncated = rawOutput.slice(0, 2000);
+    return `The typecheck failed. Here is the output:\n\n${truncated}\n\nFix the type errors shown above.`;
+  }
+
+  const lines = errors.map((error) => {
+    // Paths may contain hyphens, `@`, brackets, etc., so match any non-space run.
+    const loc = error.match(/(\S+\.\w+)(?:\((\d+),(\d+)\)|:(\d+):(\d+))/);
+    const tsMatch = error.match(/error\s+(TS\d+):\s*(.+)/);
+    if (!loc || !tsMatch) return `- ${error}`;
+    // Keep line:col so the agent can jump straight to the offending code.
+    return `- ${loc[1]}:${loc[2] ?? loc[4]}:${loc[3] ?? loc[5]}: ${tsMatch[2]} (${tsMatch[1]})`;
+  });
+
+  const plural = errors.length === 1 ? '' : 's';
+  return `The typecheck failed with ${errors.length} error${plural}:\n\n${lines.join('\n')}\n\nFix these type errors in the indicated files.`;
+}
+
+/**
+ * Render build issues as a bullet list wrapped in an agent-ready prompt.
+ */
+function formatBuildErrors(issues: ValidationIssue[]): string {
+  const bullets = issues.map(({ message }) => `- ${message}`).join('\n');
+  return `The build failed:\n\n${bullets}\n\nFix these build errors.`;
+}
+
+/**
+ * Build the combined retry prompt from every failed check.
+ * Used for `agentRetryPrompt` when at least one quick check failed.
+ *
+ * Passing results are ignored, as are failed results without an `agentPrompt`.
+ * The remaining prompts are joined with a blank line between them, keeping
+ * the order of `results`.
+ *
+ * Returns an empty string when no failed result carries a prompt.
+ */
+function formatForAgent(results: QuickCheckResult[]): string {
+  const prompts = results.flatMap((result) =>
+    // Only failed phases that produced a prompt contribute to the output.
+    !result.passed && result.agentPrompt ? [result.agentPrompt] : [],
+  );
+
+  return prompts.join('\n\n');
+}
+
+/**
+ * Spawn a command via the shell and collect its output. Never rejects: spawn
+ * errors and signal kills resolve with exit code 1 and a note added to stderr.
+ */
+function spawnCommand(
+  command: string,
+  args: string[],
+  cwd: string,
+  timeoutMs: number,
+): Promise<{ exitCode: number; stdout: string; stderr: string }> {
+  return new Promise((resolve) => {
+    const proc = spawn(command, args, { cwd, shell: true, timeout: timeoutMs });
+
+    let stdout = '';
+    let stderr = '';
+
+    proc.stdout?.on('data', (data: Buffer) => {
+      stdout += data.toString();
+    });
+    proc.stderr?.on('data', (data: Buffer) => {
+      stderr += data.toString();
+    });
+
+    proc.on('close', (code, signal) => {
+      // A signal means the process was killed, typically by the `timeout` option.
+      if (signal) stderr += `\nCommand killed by ${signal}; it may have exceeded the ${timeoutMs}ms timeout.`;
+      resolve({ exitCode: code ?? 1, stdout, stderr });
+    });
+
+    proc.on('error', (error) => {
+      // Surface the failure reason (e.g. ENOENT) instead of dropping it.
+      resolve({ exitCode: 1, stdout, stderr: `${stderr}\n${error.message}` });
+    });
+  });
+}
diff --git a/src/lib/validation/types.ts b/src/lib/validation/types.ts
index e3675fb..25a5ea5 100644
--- a/src/lib/validation/types.ts
+++ b/src/lib/validation/types.ts
@@ -47,5 +47,22 @@ export interface ValidationRules {
   variants?: Record<string, VariantRules>;
 }
 
+export interface QuickCheckResult {
+  passed: boolean; // whether this phase succeeded
+  phase: 'typecheck' | 'build'; // which quick check produced this result
+  issues: ValidationIssue[]; // problems reported by this phase
+  /** Agent-ready, actionable description of the failure; null when the phase passed */
+  agentPrompt: string | null;
+  durationMs: number; // time spent in this phase, in milliseconds
+}
+
+export interface QuickChecksOutput {
+  passed: boolean; // true only when every phase that ran passed
+  results: QuickCheckResult[]; // one entry per phase that actually ran, in execution order
+  /** Combined agent-ready prompt summarizing all failures; null when everything passed */
+  agentRetryPrompt: string | null;
+  totalDurationMs: number; // elapsed time for the whole quick-check run, in milliseconds
+}
+
 // Re-export BuildResult from build-validator
 export type { BuildResult } from './build-validator.js';
diff --git a/src/lib/validation/validator.ts b/src/lib/validation/validator.ts
index 772adba..b35a2fa 100644
--- a/src/lib/validation/validator.ts
+++ b/src/lib/validation/validator.ts
@@ -30,12 +30,12 @@ export async function validateInstallation(
   }
 
   // Run validations
-  await validatePackages(rules, projectDir, issues);
-  await validateEnvVars(rules, projectDir, issues);
-  await validateFiles(rules, projectDir, issues);
+  issues.push(...await validatePackages(rules, projectDir));
+  issues.push(...await validateEnvVars(rules, projectDir));
+  issues.push(...await validateFiles(rules, projectDir));
 
   // Run framework-specific cross-validations
-  await validateFrameworkSpecific(framework, projectDir, issues);
+  issues.push(...await validateFrameworkSpecific(framework, projectDir));
 
   // Run build validation if enabled
   if (options.runBuild !== false) {
@@ -74,16 +74,17 @@ async function loadRules(framework: string, variant?: string): Promise<Validatio
   }
 }
 
-async function validatePackages(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise<void> {
+export async function validatePackages(rules: ValidationRules, projectDir: string): Promise<ValidationIssue[]> {
+  const issues: ValidationIssue[] = [];
   const pkgPath = join(projectDir, 'package.json');
-  if (!existsSync(pkgPath)) return;
+  if (!existsSync(pkgPath)) return issues;
 
   let pkg: Record<string, unknown>;
   try {
     pkg = JSON.parse(await readFile(pkgPath, 'utf-8'));
   } catch {
     // Malformed package.json - skip package validation
-    return;
+    return issues;
   }
 
   const deps = (pkg.dependencies || {}) as Record<string, string>;
@@ -103,9 +104,12 @@ async function validatePackages(rules: ValidationRules, projectDir: string, issu
       });
     }
   }
+
+  return issues;
 }
 
-async function validateEnvVars(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise<void> {
+export async function validateEnvVars(rules: ValidationRules, projectDir: string): Promise<ValidationIssue[]> {
+  const issues: ValidationIssue[] = [];
   const envPath = join(projectDir, '.env.local');
   let envContent = '';
 
@@ -120,7 +124,7 @@ async function validateEnvVars(rules: ValidationRules, projectDir: string, issue
         hint: 'Create .env.local with required environment variables',
       });
     }
-    return;
+    return issues;
   }
 
   for (const rule of rules.envVars) {
@@ -144,9 +148,13 @@ async function validateEnvVars(rules: ValidationRules, projectDir: string, issue
       });
     }
   }
+
+  return issues;
 }
 
-async function validateFiles(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise<void> {
+export async function validateFiles(rules: ValidationRules, projectDir: string): Promise<ValidationIssue[]> {
+  const issues: ValidationIssue[] = [];
+
   for (const rule of rules.files) {
     let matches: string[];
     try {
@@ -205,16 +213,19 @@ async function validateFiles(rules: ValidationRules, projectDir: string, issues:
       }
     }
   }
+
+  return issues;
 }
 
 /**
  * Framework-specific cross-validations that require reading multiple sources.
  */
-async function validateFrameworkSpecific(
+export async function validateFrameworkSpecific(
   framework: string,
   projectDir: string,
-  issues: ValidationIssue[],
-): Promise<void> {
+): Promise<ValidationIssue[]> {
+  const issues: ValidationIssue[] = [];
+
   // Universal cross-validations
   await validateCredentialFormats(projectDir, issues);
   await validateDuplicateEnvVars(projectDir, issues);
@@ -238,6 +249,8 @@ async function validateFrameworkSpecific(
       await validateCookiePasswordLength(projectDir, issues, 'WORKOS_COOKIE_PASSWORD');
       break;
   }
+
+  return issues;
 }
 
 /**

From 8436ed413b852f90bb6fd2b5dcfe90dd83936e1c Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 07:43:59 -0600
Subject: [PATCH 02/14] feat: add retry loop for agent self-correction on
 validation failures

Extend the async generator in agent-interface to yield follow-up
correction prompts when quick-checks (typecheck/build) fail. The agent
retains full conversation context and gets up to 2 chances to fix its
own mistakes before results surface to the user. Configurable via
maxRetries option (default 2, 0 to disable).
---
 src/lib/agent-interface.spec.ts | 306 ++++++++++++++++++++++++++++++++
 src/lib/agent-interface.ts      |  90 ++++++++--
 src/lib/agent-runner.ts         |  41 +++--
 src/lib/events.ts               |   4 +
 src/utils/types.ts              |   7 +
 5 files changed, 419 insertions(+), 29 deletions(-)
 create mode 100644 src/lib/agent-interface.spec.ts

diff --git a/src/lib/agent-interface.spec.ts b/src/lib/agent-interface.spec.ts
new file mode 100644
index 0000000..d627276
--- /dev/null
+++ b/src/lib/agent-interface.spec.ts
@@ -0,0 +1,306 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { EventEmitter } from 'node:events';
+
+const { mockQuery, mockConfig } = vi.hoisted(() => ({
+  mockQuery: vi.fn(),
+  mockConfig: {
+    model: 'test-model',
+    workos: { clientId: 'client_test', authkitDomain: 'test.workos.com', llmGatewayUrl: 'http://localhost:8000' },
+    telemetry: { enabled: false, eventName: 'test_event' },
+    proxy: { refreshThresholdMs: 300000 },
+    nodeVersion: '20',
+    logging: { debugMode: false },
+    documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' },
+    frameworks: {},
+    legacy: { oauthPort: 3000 },
+    branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false },
+  },
+}));
+
+vi.mock('@anthropic-ai/claude-agent-sdk', () => ({
+  query: (...args: unknown[]) => mockQuery(...args),
+}));
+
+vi.mock('../utils/debug.js', () => ({
+  debug: vi.fn(),
+  logInfo: vi.fn(),
+  logWarn: vi.fn(),
+  logError: vi.fn(),
+  initLogFile: vi.fn(),
+  getLogFilePath: vi.fn(() => null),
+}));
+
+vi.mock('../utils/analytics.js', () => ({
+  analytics: {
+    capture: vi.fn(),
+    setTag: vi.fn(),
+    shutdown: vi.fn(),
+    llmRequest: vi.fn(),
+    incrementAgentIterations: vi.fn(),
+    toolCalled: vi.fn(),
+  },
+}));
+
+vi.mock('./settings.js', () => ({
+  getConfig: vi.fn(() => mockConfig),
+  getAuthkitDomain: vi.fn(() => 'test.workos.com'),
+  getCliAuthClientId: vi.fn(() => 'client_test'),
+}));
+
+vi.mock('./credentials.js', () => ({
+  hasCredentials: vi.fn(() => false),
+  getCredentials: vi.fn(() => null),
+}));
+
+vi.mock('./token-refresh.js', () => ({
+  ensureValidToken: vi.fn(async () => ({ success: true })),
+}));
+
+vi.mock('./credential-proxy.js', () => ({
+  startCredentialProxy: vi.fn(),
+}));
+
+vi.mock('../utils/urls.js', () => ({
+  getLlmGatewayUrlFromHost: vi.fn(() => 'http://localhost:8000'),
+}));
+
+import { runAgent, type RetryConfig } from './agent-interface.js';
+import { InstallerEventEmitter } from './events.js';
+import type { InstallerOptions } from '../utils/types.js';
+
+/**
+ * Create a mock SDK response that consumes the prompt stream and yields
+ * responses for each prompt message. This models the real SDK behavior:
+ * the response generator stays alive as long as prompts keep coming.
+ */
+function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>) {
+  return function mockQueryImpl({ prompt }: { prompt: AsyncIterable<unknown>; options: unknown }) {
+    let turnIndex = 0;
+
+    async function* responseGenerator() {
+      // Consume each prompt message and respond with the corresponding turn
+      for await (const _promptMsg of prompt) {
+        if (turnIndex >= turns.length) continue;
+
+        const turn = turns[turnIndex];
+        turnIndex++;
+
+        if (turn.text) {
+          yield {
+            type: 'assistant',
+            message: {
+              content: [{ type: 'text', text: turn.text }],
+              usage: { input_tokens: 100, output_tokens: 50 },
+              model: 'test-model',
+            },
+          };
+        }
+
+        yield {
+          type: 'result',
+          subtype: turn.error ? 'error' : 'success',
+          result: turn.text ?? '',
+          ...(turn.error ? { errors: ['Test error'] } : {}),
+        };
+      }
+    }
+
+    return responseGenerator();
+  };
+}
+
+function makeAgentConfig() {
+  return {
+    workingDirectory: '/tmp/test',
+    mcpServers: {},
+    model: 'test-model',
+    allowedTools: [],
+    sdkEnv: {},
+  };
+}
+
+function makeOptions(overrides: Partial<InstallerOptions> = {}): InstallerOptions {
+  return {
+    debug: false,
+    forceInstall: false,
+    installDir: '/tmp/test',
+    local: true,
+    ci: false,
+    skipAuth: true,
+    ...overrides,
+  };
+}
+
+describe('runAgent retry loop', () => {
+  let emitter: InstallerEventEmitter;
+  let emittedEvents: Array<{ event: string; payload: unknown }>;
+
+  beforeEach(() => {
+    mockQuery.mockReset();
+    emitter = new InstallerEventEmitter();
+    emittedEvents = [];
+
+    // Capture all events
+    const originalEmit = emitter.emit.bind(emitter);
+    emitter.emit = ((event: string, payload: unknown) => {
+      emittedEvents.push({ event, payload });
+      return originalEmit(event, payload);
+    }) as typeof emitter.emit;
+  });
+
+  it('returns retryCount=0 when no retryConfig provided', async () => {
+    mockQuery.mockImplementation(
+      createMockSDKResponse([{ text: 'Done!' }]),
+    );
+
+    const result = await runAgent(
+      makeAgentConfig(),
+      'Test prompt',
+      makeOptions(),
+      undefined,
+      emitter,
+    );
+
+    expect(result.error).toBeUndefined();
+    expect(result.retryCount).toBe(0);
+  });
+
+  it('returns retryCount=0 when validation passes first try', async () => {
+    mockQuery.mockImplementation(
+      createMockSDKResponse([{ text: 'Done!' }]),
+    );
+
+    const validateAndFormat = vi.fn().mockResolvedValue(null); // passes
+
+    const result = await runAgent(
+      makeAgentConfig(),
+      'Test prompt',
+      makeOptions(),
+      undefined,
+      emitter,
+      { maxRetries: 2, validateAndFormat },
+    );
+
+    expect(result.error).toBeUndefined();
+    expect(result.retryCount).toBe(0);
+    expect(validateAndFormat).toHaveBeenCalledTimes(1);
+
+    // Should emit validation:retry:start and validation:retry:complete
+    const retryStartEvents = emittedEvents.filter((e) => e.event === 'validation:retry:start');
+    const retryCompleteEvents = emittedEvents.filter((e) => e.event === 'validation:retry:complete');
+    expect(retryStartEvents).toHaveLength(1);
+    expect(retryCompleteEvents).toHaveLength(1);
+    expect(retryCompleteEvents[0].payload).toEqual({ attempt: 1, passed: true });
+
+    // Should NOT emit agent:retry (no retry happened)
+    const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry');
+    expect(retryEvents).toHaveLength(0);
+  });
+
+  it('retries once when validation fails then passes', async () => {
+    // Two turns: initial + one retry
+    mockQuery.mockImplementation(
+      createMockSDKResponse([
+        { text: 'Initial attempt' },
+        { text: 'Fixed it!' },
+      ]),
+    );
+
+    const validateAndFormat = vi.fn()
+      .mockResolvedValueOnce('Type error in src/foo.ts') // fail first
+      .mockResolvedValueOnce(null); // pass second
+
+    const result = await runAgent(
+      makeAgentConfig(),
+      'Test prompt',
+      makeOptions(),
+      undefined,
+      emitter,
+      { maxRetries: 2, validateAndFormat },
+    );
+
+    expect(result.error).toBeUndefined();
+    expect(result.retryCount).toBe(1);
+    expect(validateAndFormat).toHaveBeenCalledTimes(2);
+
+    // Should emit agent:retry once
+    const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry');
+    expect(retryEvents).toHaveLength(1);
+    expect(retryEvents[0].payload).toEqual({ attempt: 1, maxRetries: 2 });
+  });
+
+  it('caps at maxRetries when validation always fails', async () => {
+    // Three turns: initial + 2 retries
+    mockQuery.mockImplementation(
+      createMockSDKResponse([
+        { text: 'Attempt 1' },
+        { text: 'Attempt 2' },
+        { text: 'Attempt 3' },
+      ]),
+    );
+
+    const validateAndFormat = vi.fn().mockResolvedValue('Still broken');
+
+    const result = await runAgent(
+      makeAgentConfig(),
+      'Test prompt',
+      makeOptions(),
+      undefined,
+      emitter,
+      { maxRetries: 2, validateAndFormat },
+    );
+
+    expect(result.error).toBeUndefined();
+    expect(result.retryCount).toBe(2);
+    // Called 2 times: after initial + after retry 1
+    // NOT called after retry 2 because the loop exits
+    expect(validateAndFormat).toHaveBeenCalledTimes(2);
+
+    const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry');
+    expect(retryEvents).toHaveLength(2);
+  });
+
+  it('preserves existing behavior with maxRetries=0', async () => {
+    mockQuery.mockImplementation(
+      createMockSDKResponse([{ text: 'Done!' }]),
+    );
+
+    const validateAndFormat = vi.fn().mockResolvedValue('Error');
+
+    const result = await runAgent(
+      makeAgentConfig(),
+      'Test prompt',
+      makeOptions(),
+      undefined,
+      emitter,
+      { maxRetries: 0, validateAndFormat },
+    );
+
+    expect(result.error).toBeUndefined();
+    expect(result.retryCount).toBe(0);
+    // validateAndFormat should never be called with maxRetries=0
+    expect(validateAndFormat).not.toHaveBeenCalled();
+  });
+
+  it('treats validateAndFormat errors as passed', async () => {
+    mockQuery.mockImplementation(
+      createMockSDKResponse([{ text: 'Done!' }]),
+    );
+
+    const validateAndFormat = vi.fn().mockRejectedValue(new Error('Validation crashed'));
+
+    const result = await runAgent(
+      makeAgentConfig(),
+      'Test prompt',
+      makeOptions(),
+      undefined,
+      emitter,
+      { maxRetries: 2, validateAndFormat },
+    );
+
+    expect(result.error).toBeUndefined();
+    expect(result.retryCount).toBe(0);
+    // Should have been called once, threw, treated as passed
+    expect(validateAndFormat).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 9022b3a..856a29a 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -72,6 +72,13 @@ export type AgentConfig = {
   workOSApiHost: string;
 };
 
+export interface RetryConfig {
+  /** Max correction attempts after the initial run; 0 disables retries. Required here — the caller chooses the default. */
+  maxRetries: number;
+  /** Run between agent turns. Return null if passed, or error prompt if failed. */
+  validateAndFormat: (workingDirectory: string) => Promise<string | null>;
+}
+
 /**
  * Internal configuration object returned by initializeAgent
  */
@@ -489,7 +496,8 @@ export async function runAgent(
     errorMessage?: string;
   },
   emitter?: InstallerEventEmitter,
-): Promise<{ error?: AgentErrorType; errorMessage?: string }> {
+  retryConfig?: RetryConfig,
+): Promise<{ error?: AgentErrorType; errorMessage?: string; retryCount?: number }> {
   const {
     spinnerMessage = 'Setting up WorkOS AuthKit...',
     successMessage = 'WorkOS AuthKit integration complete',
@@ -509,24 +517,73 @@ export async function runAgent(
   const collectedText: string[] = [];
 
   try {
-    // Workaround for SDK bug: stdin closes before canUseTool responses can be sent.
-    // The fix is to use an async generator for the prompt that stays open until
-    // the result is received, keeping the stdin stream alive for permission responses.
-    // See: https://github.com/anthropics/claude-code/issues/4775
-    // See: https://github.com/anthropics/claude-agent-sdk-typescript/issues/41
-    let signalDone: () => void;
-    const resultReceived = new Promise<void>((resolve) => {
-      signalDone = resolve;
-    });
+    // Retry loop coordination
+    let retryCount = 0;
+    const maxRetries = retryConfig?.maxRetries ?? 0;
+
+    // Turn completion signals — the response loop resolves currentTurnDone
+    // when a 'result' message arrives. The generator awaits it between turns.
+    let resolveCurrentTurn!: () => void;
+    let currentTurnDone!: Promise<void>;
+
+    function resetTurnSignal() {
+      currentTurnDone = new Promise<void>((resolve) => {
+        resolveCurrentTurn = resolve;
+      });
+    }
+    resetTurnSignal();
 
     const createPromptStream = async function* () {
+      // Initial prompt
       yield {
         type: 'user',
         session_id: '',
         message: { role: 'user', content: prompt },
         parent_tool_use_id: null,
       };
-      await resultReceived;
+
+      // Retry loop — yield follow-up correction prompts on validation failure
+      if (retryConfig && maxRetries > 0) {
+        while (retryCount < maxRetries) {
+          // Wait for agent to finish current turn
+          await currentTurnDone;
+
+          // Run validation between turns
+          emitter?.emit('validation:retry:start', { attempt: retryCount + 1 });
+
+          let validationPrompt: string | null;
+          try {
+            validationPrompt = await retryConfig.validateAndFormat(agentConfig.workingDirectory);
+          } catch (err) {
+            // Don't block on validation bugs — treat as passed
+            logError('validateAndFormat threw:', err);
+            validationPrompt = null;
+          }
+
+          emitter?.emit('validation:retry:complete', {
+            attempt: retryCount + 1,
+            passed: validationPrompt === null,
+          });
+
+          if (validationPrompt === null) break; // Validation passed
+
+          retryCount++;
+          emitter?.emit('agent:retry', { attempt: retryCount, maxRetries });
+
+          resetTurnSignal();
+
+          // Feed errors back to agent in same conversation
+          yield {
+            type: 'user',
+            session_id: '',
+            message: { role: 'user', content: validationPrompt },
+            parent_tool_use_id: null,
+          };
+        }
+      }
+
+      // Keep generator alive until the final result is received
+      await currentTurnDone;
     };
 
     // Load plugin with bundled skills
@@ -570,9 +627,9 @@ export async function runAgent(
       if (messageError) {
         sdkError = messageError;
       }
-      // Signal completion when result received
+      // Signal turn completion when result received — this resumes the generator
       if (message.type === 'result') {
-        signalDone!();
+        resolveCurrentTurn();
       }
     }
 
@@ -597,15 +654,18 @@ export async function runAgent(
       return { error: AgentErrorType.RESOURCE_MISSING, errorMessage: 'Could not access setup resource' };
     }
 
-    logInfo(`Agent run completed in ${Math.round(durationMs / 1000)}s`);
+    logInfo(`Agent run completed in ${Math.round(durationMs / 1000)}s (${retryCount} retries)`);
     analytics.capture(INSTALLER_INTERACTION_EVENT_NAME, {
       action: 'agent integration completed',
       duration_ms: durationMs,
       duration_seconds: Math.round(durationMs / 1000),
+      retry_count: retryCount,
+      max_retries: maxRetries,
+      passed_after_retry: retryCount > 0,
     });
 
     // Don't emit agent:success here - let the state machine handle lifecycle events
-    return {};
+    return { retryCount };
   } catch (error) {
     // Don't emit events here - just log and re-throw for state machine to handle
     logError('Agent run failed:', error);
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index 79a453e..b2eef7d 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -9,7 +9,7 @@ import {
 } from '../utils/clack-utils.js';
 import { analytics } from '../utils/analytics.js';
 import { INSTALLER_INTERACTION_EVENT_NAME } from './constants.js';
-import { initializeAgent, runAgent } from './agent-interface.js';
+import { initializeAgent, runAgent, type RetryConfig } from './agent-interface.js';
 import { uploadEnvironmentVariablesStep } from '../steps/index.js';
 import { autoConfigureWorkOSEnvironment } from './workos-management.js';
 import { detectPort, getCallbackPath } from './port-detection.js';
@@ -113,7 +113,20 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
     options,
   );
 
-  // Run agent - errors will throw naturally with skill-based approach
+  // Build validation callback for retry loop — runs quick checks and returns the agent retry prompt on failure
+  const validateAndFormat = async (workingDirectory: string): Promise<string | null> => {
+    const quickResult = await runQuickChecks(workingDirectory);
+    return quickResult.passed ? null : quickResult.agentRetryPrompt;
+  };
+
+  // Build retry config
+  const retryConfig: RetryConfig | undefined =
+    options.noValidate ? undefined : {
+      maxRetries: options.maxRetries ?? 2,
+      validateAndFormat,
+    };
+
+  // Run agent with retry support — agent gets correction prompts on validation failure
   const agentResult = await runAgent(
     agent,
     integrationPrompt,
@@ -124,6 +137,7 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
       errorMessage: 'Integration failed',
     },
     options.emitter,
+    retryConfig,
   );
 
   // If agent returned an error, throw so state machine can handle it
@@ -133,20 +147,19 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
     throw new Error(`Agent SDK error: ${message}`);
   }
 
-  // Run post-installation validation
-  if (!options.noValidate) {
-    // Quick checks: fast typecheck + build before full validation
-    options.emitter?.emit('validation:quick:start', {});
-
-    const quickCheckResult = await runQuickChecks(options.installDir);
-
-    options.emitter?.emit('validation:quick:complete', {
-      passed: quickCheckResult.passed,
-      results: quickCheckResult.results,
-      durationMs: quickCheckResult.totalDurationMs,
+  // Track retry metrics
+  if (agentResult.retryCount !== undefined && agentResult.retryCount > 0) {
+    analytics.capture(INSTALLER_INTERACTION_EVENT_NAME, {
+      action: 'agent retry summary',
+      retry_count: agentResult.retryCount,
+      max_retries: options.maxRetries ?? 2,
+      passed_after_retry: true,
     });
+  }
 
-    // Full validation — skip build since quick checks already ran it
+  // Run full validation after agent (with retries) completes
+  // Quick checks ran inside the retry loop (not when maxRetries is 0, nor after the last retry) — skip build
+  if (!options.noValidate) {
     options.emitter?.emit('validation:start', { framework: config.metadata.integration });
 
     const validationResult = await validateInstallation(config.metadata.integration, options.installDir, {
diff --git a/src/lib/events.ts b/src/lib/events.ts
index 91458cf..e0a2279 100644
--- a/src/lib/events.ts
+++ b/src/lib/events.ts
@@ -52,6 +52,10 @@ export interface InstallerEvents {
   'agent:progress': { step: string; detail?: string };
   'agent:success': { summary?: string };
   'agent:failure': { message: string; stack?: string };
+  'agent:retry': { attempt: number; maxRetries: number };
+
+  'validation:retry:start': { attempt: number };
+  'validation:retry:complete': { attempt: number; passed: boolean };
 
   'validation:quick:start': Record<string, never>;
   'validation:quick:complete': {
diff --git a/src/utils/types.ts b/src/utils/types.ts
index cb54d76..901a05c 100644
--- a/src/utils/types.ts
+++ b/src/utils/types.ts
@@ -91,6 +91,13 @@ export type InstallerOptions = {
    * Requires ANTHROPIC_API_KEY environment variable.
    */
   direct?: boolean;
+
+  /**
+   * Max correction attempts after initial agent run.
+   * The agent gets this many chances to fix validation failures (typecheck/build).
+   * Default: 2. Set to 0 to disable retries entirely.
+   */
+  maxRetries?: number;
 };
 
 export interface Feature {

From c0ad5ae3a7a5de43d33a7c38fd56495725523599 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 07:56:24 -0600
Subject: [PATCH 03/14] feat: add within-session correction metrics to evals
 framework

Add retry-aware execution to AgentExecutor using the same async
generator + quick-checks pattern from production. Evals now track
three tiers: first-attempt, with-correction, and with-retry pass
rates. Adds --no-correction flag to disable for baseline comparison.
---
 tests/evals/agent-executor.ts   | 85 +++++++++++++++++++++++++++++++--
 tests/evals/cli.ts              |  6 +++
 tests/evals/index.ts            |  1 +
 tests/evals/parallel-runner.ts  |  7 ++-
 tests/evals/reporter.ts         |  3 +-
 tests/evals/runner.ts           |  9 +++-
 tests/evals/success-criteria.ts | 20 ++++++--
 tests/evals/types.ts            |  4 ++
 8 files changed, 125 insertions(+), 10 deletions(-)

diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts
index 3c4b0cd..5a9b135 100644
--- a/tests/evals/agent-executor.ts
+++ b/tests/evals/agent-executor.ts
@@ -7,6 +7,7 @@ import { writeEnvLocal } from '../../src/lib/env-writer.js';
 import { parseEnvFile } from '../../src/utils/env-parser.js';
 import { getConfig } from '../../src/lib/settings.js';
 import { LatencyTracker } from './latency-tracker.js';
+import { runQuickChecks } from '../../src/lib/validation/quick-checks.js';
 import type { ToolCall, LatencyMetrics } from './types.js';
 
 export interface AgentResult {
@@ -15,6 +16,17 @@ export interface AgentResult {
   toolCalls: ToolCall[];
   error?: string;
   latencyMetrics?: LatencyMetrics;
+  /** Number of within-session correction attempts */
+  correctionAttempts: number;
+  /** Whether the agent self-corrected after an initial failure */
+  selfCorrected: boolean;
+}
+
+export interface AgentRetryConfig {
+  /** Enable within-session correction. Default: true */
+  enabled: boolean;
+  /** Max correction attempts. Default: 2 */
+  maxRetries: number;
 }
 
 export interface AgentExecutorOptions {
@@ -77,7 +89,8 @@ export class AgentExecutor {
     this.latencyTracker = new LatencyTracker();
   }
 
-  async run(): Promise<AgentResult> {
+  async run(retryConfig?: AgentRetryConfig): Promise<AgentResult> {
+    const config = retryConfig ?? { enabled: true, maxRetries: 2 };
     const integration = this.getIntegration();
     const toolCalls: ToolCall[] = [];
     const collectedOutput: string[] = [];
@@ -106,6 +119,22 @@ export class AgentExecutor {
     const skillName = SKILL_NAMES[integration];
     const prompt = this.buildPrompt(skillName);
 
+    // Retry loop coordination
+    let correctionAttempts = 0;
+    const maxRetries = config.enabled ? config.maxRetries : 0;
+    const workDir = this.workDir;
+
+    // Turn completion signals
+    let resolveCurrentTurn!: () => void;
+    let currentTurnDone!: Promise<void>;
+
+    function resetTurnSignal() {
+      currentTurnDone = new Promise<void>((resolve) => {
+        resolveCurrentTurn = resolve;
+      });
+    }
+    resetTurnSignal();
+
     // Initialize and run agent
     try {
       const { query } = await import('@anthropic-ai/claude-agent-sdk');
@@ -126,8 +155,51 @@ export class AgentExecutor {
       const __dirname = path.dirname(__filename);
       const pluginPath = path.join(__dirname, '../..');
 
+      // Retry-aware prompt stream (same pattern as production agent-interface.ts)
+      const createPromptStream = async function* () {
+        yield {
+          type: 'user',
+          session_id: '',
+          message: { role: 'user', content: prompt },
+          parent_tool_use_id: null,
+        };
+
+        if (maxRetries > 0) {
+          while (correctionAttempts < maxRetries) {
+            await currentTurnDone;
+
+            let validationPrompt: string | null;
+            try {
+              const quickResult = await runQuickChecks(workDir);
+              validationPrompt = quickResult.passed ? null : quickResult.agentRetryPrompt;
+            } catch {
+              validationPrompt = null; // treat validation errors as passed
+            }
+
+            if (validationPrompt === null) break;
+
+            correctionAttempts++;
+            if (label && process.env.EVAL_VERBOSE) {
+              console.log(`${label} Correction attempt ${correctionAttempts}/${maxRetries}`);
+            }
+
+            resetTurnSignal();
+
+            yield {
+              type: 'user',
+              session_id: '',
+              message: { role: 'user', content: validationPrompt },
+              parent_tool_use_id: null,
+            };
+          }
+        }
+
+        // Keep generator alive until final result
+        await currentTurnDone;
+      };
+
       const response = query({
-        prompt: prompt,
+        prompt: createPromptStream(),
         options: {
           model: getConfig().model,
           cwd: this.workDir,
@@ -145,9 +217,12 @@ export class AgentExecutor {
         },
       });
 
-      // Process message stream
+      // Process message stream — signal turn completion on result
       for await (const message of response) {
         this.handleMessage(message, toolCalls, collectedOutput, label);
+        if (message.type === 'result') {
+          resolveCurrentTurn();
+        }
       }
 
       const latencyMetrics = this.latencyTracker.finish();
@@ -156,6 +231,8 @@ export class AgentExecutor {
         output: collectedOutput.join('\n'),
         toolCalls,
         latencyMetrics,
+        correctionAttempts,
+        selfCorrected: correctionAttempts > 0,
       };
     } catch (error) {
       const latencyMetrics = this.latencyTracker.finish();
@@ -165,6 +242,8 @@ export class AgentExecutor {
         toolCalls,
         latencyMetrics,
         error: error instanceof Error ? error.message : String(error),
+        correctionAttempts,
+        selfCorrected: false,
       };
     }
   }
diff --git a/tests/evals/cli.ts b/tests/evals/cli.ts
index 757a959..12a4cca 100644
--- a/tests/evals/cli.ts
+++ b/tests/evals/cli.ts
@@ -12,6 +12,7 @@ export interface CliOptions {
   sequential: boolean;
   noDashboard: boolean;
   noFail: boolean;
+  noCorrection: boolean;
   quality: boolean;
   command?: 'run' | 'history' | 'compare' | 'diff' | 'prune' | 'logs' | 'show';
   compareIds?: [string, string];
@@ -61,6 +62,7 @@ export function parseArgs(args: string[]): CliOptions {
     sequential: false,
     noDashboard: false,
     noFail: false,
+    noCorrection: false,
     quality: false,
   };
 
@@ -144,6 +146,8 @@ export function parseArgs(args: string[]): CliOptions {
       options.noDashboard = true;
     } else if (arg === '--no-fail') {
       options.noFail = true;
+    } else if (arg === '--no-correction') {
+      options.noCorrection = true;
     } else if (arg === '--quality' || arg === '-q') {
       options.quality = true;
     }
@@ -193,6 +197,8 @@ Options:
 
   --no-fail           Exit 0 even if success criteria thresholds not met
 
+  --no-correction     Disable within-session agent self-correction retries
+
   --quality, -q       Enable LLM-based quality grading (adds cost/time)
 
   --json              Output results as JSON (for scripting)
diff --git a/tests/evals/index.ts b/tests/evals/index.ts
index 7e92274..118f3a2 100644
--- a/tests/evals/index.ts
+++ b/tests/evals/index.ts
@@ -60,6 +60,7 @@ async function main() {
           noDashboard: options.noDashboard,
           debug: options.debug,
           noFail: options.noFail,
+          noCorrection: options.noCorrection,
           quality: options.quality,
         });
 
diff --git a/tests/evals/parallel-runner.ts b/tests/evals/parallel-runner.ts
index 4bf2f35..2383db7 100644
--- a/tests/evals/parallel-runner.ts
+++ b/tests/evals/parallel-runner.ts
@@ -18,6 +18,7 @@ interface ParallelRunnerOptions {
   keep?: boolean;
   keepOnFail?: boolean;
   concurrency?: number; // Override auto-detection
+  noCorrection?: boolean;
 }
 
 export class ParallelRunner {
@@ -125,7 +126,9 @@ export class ParallelRunner {
           verbose: this.options.verbose,
           scenarioName,
         });
-        const agentResult = await executor.run();
+        const agentResult = await executor.run(
+          this.options.noCorrection ? { enabled: false, maxRetries: 0 } : undefined,
+        );
         lastToolCalls = agentResult.toolCalls;
 
         const grader = new scenario.grader(workDir);
@@ -143,6 +146,8 @@ export class ParallelRunner {
           attempts: attempt,
           latencyMetrics: agentResult.latencyMetrics,
           keyFiles,
+          correctionAttempts: agentResult.correctionAttempts,
+          selfCorrected: agentResult.selfCorrected,
         };
 
         if (gradeResult.passed) {
diff --git a/tests/evals/reporter.ts b/tests/evals/reporter.ts
index 4cd1b4f..297641e 100644
--- a/tests/evals/reporter.ts
+++ b/tests/evals/reporter.ts
@@ -59,7 +59,8 @@ export function printMatrix(results: EvalResult[]): void {
   const passed = results.filter((r) => r.passed).length;
   const total = results.length;
   const rate = ((passed / total) * 100).toFixed(1);
-  console.log(`\nResults: ${passed}/${total} passed (${rate}%)`);
+  const selfCorrected = results.filter((r) => r.selfCorrected).length;
+  console.log(`\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? `, ${selfCorrected} self-corrected` : ''}`);
 
   if (passed < total) {
     console.log('\nFailed scenarios:');
diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts
index c48db6d..2cadba1 100644
--- a/tests/evals/runner.ts
+++ b/tests/evals/runner.ts
@@ -98,6 +98,7 @@ export interface ExtendedEvalOptions extends EvalOptions {
   noDashboard?: boolean;
   debug?: boolean;
   noFail?: boolean;
+  noCorrection?: boolean;
   quality?: boolean;
 }
 
@@ -122,6 +123,7 @@ export async function runEvals(options: ExtendedEvalOptions): Promise<EvalResult
     keep: options.keep,
     keepOnFail: options.keepOnFail,
     concurrency: options.sequential ? 1 : undefined,
+    noCorrection: options.noCorrection,
   });
 
   // Initialize log writer
@@ -302,10 +304,13 @@ function printValidationSummary(validation: ValidationResult): void {
     }
   }
   console.log(
-    `\nFirst-attempt: ${(validation.actual.firstAttemptPassRate * 100).toFixed(1)}% (required: ${validation.criteria.firstAttemptPassRate * 100}%)`,
+    `\nFirst-attempt:    ${(validation.actual.firstAttemptPassRate * 100).toFixed(1)}% (required: ${validation.criteria.firstAttemptPassRate * 100}%)`,
   );
   console.log(
-    `With-retry:    ${(validation.actual.withRetryPassRate * 100).toFixed(1)}% (required: ${validation.criteria.withRetryPassRate * 100}%)`,
+    `With-correction:  ${(validation.actual.withCorrectionPassRate * 100).toFixed(1)}%${validation.criteria.withCorrectionPassRate !== undefined ? ` (required: ${validation.criteria.withCorrectionPassRate * 100}%)` : ''}`,
+  );
+  console.log(
+    `With-retry:       ${(validation.actual.withRetryPassRate * 100).toFixed(1)}% (required: ${validation.criteria.withRetryPassRate * 100}%)`,
   );
   console.log('═'.repeat(50));
 }
diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts
index f6b3cb7..e25752b 100644
--- a/tests/evals/success-criteria.ts
+++ b/tests/evals/success-criteria.ts
@@ -7,7 +7,9 @@ import type { EvalResult } from './types.js';
 export interface SuccessCriteria {
   /** Minimum pass rate on first attempt (0-1) */
   firstAttemptPassRate: number;
-  /** Minimum pass rate with retries (0-1) */
+  /** Minimum pass rate after within-session correction (0-1) */
+  withCorrectionPassRate?: number;
+  /** Minimum pass rate with full scenario retries (0-1) */
   withRetryPassRate: number;
   /** Maximum duration per scenario in ms (optional, for future use) */
   maxDurationMs?: number;
@@ -24,6 +26,7 @@ export interface ValidationResult {
   criteria: SuccessCriteria;
   actual: {
     firstAttemptPassRate: number;
+    withCorrectionPassRate: number;
     withRetryPassRate: number;
   };
   failures: string[];
@@ -34,10 +37,16 @@ export interface ValidationResult {
  * Returns detailed breakdown of pass/fail status with actionable messages.
  */
 export function validateResults(results: EvalResult[], criteria: SuccessCriteria = DEFAULT_CRITERIA): ValidationResult {
-  const firstAttemptPassed = results.filter((r) => r.attempts === 1 && r.passed).length;
+  // First attempt: passed on first scenario attempt with no corrections
+  const firstAttemptPassed = results.filter(
+    (r) => r.attempts === 1 && r.passed && (r.correctionAttempts ?? 0) === 0,
+  ).length;
+  // With correction: passed on first scenario attempt (may have used within-session correction)
+  const withCorrectionPassed = results.filter((r) => r.attempts === 1 && r.passed).length;
   const totalPassed = results.filter((r) => r.passed).length;
 
   const firstAttemptRate = results.length > 0 ? firstAttemptPassed / results.length : 0;
+  const withCorrectionRate = results.length > 0 ? withCorrectionPassed / results.length : 0;
   const withRetryRate = results.length > 0 ? totalPassed / results.length : 0;
 
   const failures: string[] = [];
@@ -46,6 +55,11 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria
       `First-attempt pass rate ${(firstAttemptRate * 100).toFixed(1)}% < ${criteria.firstAttemptPassRate * 100}% required`,
     );
   }
+  if (criteria.withCorrectionPassRate !== undefined && withCorrectionRate < criteria.withCorrectionPassRate) {
+    failures.push(
+      `With-correction pass rate ${(withCorrectionRate * 100).toFixed(1)}% < ${criteria.withCorrectionPassRate * 100}% required`,
+    );
+  }
   if (withRetryRate < criteria.withRetryPassRate) {
     failures.push(
       `With-retry pass rate ${(withRetryRate * 100).toFixed(1)}% < ${criteria.withRetryPassRate * 100}% required`,
@@ -55,7 +69,7 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria
   return {
     passed: failures.length === 0,
     criteria,
-    actual: { firstAttemptPassRate: firstAttemptRate, withRetryPassRate: withRetryRate },
+    actual: { firstAttemptPassRate: firstAttemptRate, withCorrectionPassRate: withCorrectionRate, withRetryPassRate: withRetryRate },
     failures,
   };
 }
diff --git a/tests/evals/types.ts b/tests/evals/types.ts
index 3f626d2..9891597 100644
--- a/tests/evals/types.ts
+++ b/tests/evals/types.ts
@@ -27,6 +27,10 @@ export interface EvalResult {
   qualityGrade?: QualityGrade;
   /** Key integration files for quality grading (replaces raw diff) */
   keyFiles?: Map<string, string>;
+  /** Within-session correction attempts (0 = passed first try) */
+  correctionAttempts?: number;
+  /** Agent self-corrected after initial failure */
+  selfCorrected?: boolean;
 }
 
 /** Input for quality grading - structured data instead of raw diff */

From 03984c060b11a7281ab2c4691f17cd41f416e95e Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 08:34:56 -0600
Subject: [PATCH 04/14] refactor: unify eval executor with production runAgent
 path

AgentExecutor now delegates to the production runAgent instead of
reimplementing the retry-aware async generator. Exports AgentRunConfig
so evals can construct it directly, adds onMessage hook for latency
tracking. Includes 13 tests verifying the wiring.
---
 src/lib/agent-interface.ts                   |   8 +-
 tests/evals/__tests__/agent-executor.spec.ts | 256 +++++++++++++++++++
 tests/evals/agent-executor.ts                | 185 ++++++--------
 3 files changed, 336 insertions(+), 113 deletions(-)
 create mode 100644 tests/evals/__tests__/agent-executor.spec.ts

diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 856a29a..5b0018c 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -80,9 +80,10 @@ export interface RetryConfig {
 }
 
 /**
- * Internal configuration object returned by initializeAgent
+ * Configuration object for running the agent.
+ * Built by initializeAgent (production) or constructed directly (evals).
  */
-type AgentRunConfig = {
+export type AgentRunConfig = {
   workingDirectory: string;
   mcpServers: McpServersConfig;
   model: string;
@@ -497,6 +498,7 @@ export async function runAgent(
   },
   emitter?: InstallerEventEmitter,
   retryConfig?: RetryConfig,
+  onMessage?: (message: SDKMessage) => void,
 ): Promise<{ error?: AgentErrorType; errorMessage?: string; retryCount?: number }> {
   const {
     spinnerMessage = 'Setting up WorkOS AuthKit...',
@@ -631,6 +633,8 @@ export async function runAgent(
       if (message.type === 'result') {
         resolveCurrentTurn();
       }
+      // Let callers observe messages (e.g., for latency tracking in evals)
+      try { onMessage?.(message); } catch { /* observer errors are non-critical */ }
     }
 
     const durationMs = Date.now() - startTime;
diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts
new file mode 100644
index 0000000..2ca23bc
--- /dev/null
+++ b/tests/evals/__tests__/agent-executor.spec.ts
@@ -0,0 +1,256 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { mkdtempSync, writeFileSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+// Hoist mocks so they're available in vi.mock factories
+const { mockRunAgent, mockConfig, mockCredentials } = vi.hoisted(() => ({
+  mockRunAgent: vi.fn(),
+  mockConfig: {
+    model: 'test-model',
+    workos: { clientId: 'client_test', authkitDomain: 'test.workos.com', llmGatewayUrl: 'http://localhost:8000' },
+    telemetry: { enabled: false, eventName: 'test_event' },
+    proxy: { refreshThresholdMs: 300000 },
+    nodeVersion: '20',
+    logging: { debugMode: false },
+    documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' },
+    frameworks: {},
+    legacy: { oauthPort: 3000 },
+    branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false },
+  },
+  mockCredentials: {
+    workosApiKey: 'sk_test_key',
+    workosClientId: 'client_test_id',
+    anthropicApiKey: 'sk-ant-test',
+  },
+}));
+
+// Mock the production runAgent — this is what we're testing the wiring to
+vi.mock('../../../src/lib/agent-interface.js', () => ({
+  runAgent: mockRunAgent,
+}));
+
+// Mock dependencies
+vi.mock('../env-loader.js', () => ({
+  loadCredentials: vi.fn(() => mockCredentials),
+}));
+
+vi.mock('../../../src/lib/env-writer.js', () => ({
+  writeEnvLocal: vi.fn(),
+}));
+
+vi.mock('../../../src/utils/env-parser.js', () => ({
+  parseEnvFile: vi.fn(() => ({})),
+}));
+
+vi.mock('../../../src/lib/settings.js', () => ({
+  getConfig: vi.fn(() => mockConfig),
+}));
+
+vi.mock('../../../src/lib/validation/quick-checks.js', () => ({
+  runQuickChecks: vi.fn(),
+}));
+
+// Mock debug/analytics that agent-interface transitively imports
+vi.mock('../../../src/utils/debug.js', () => ({
+  debug: vi.fn(),
+  logInfo: vi.fn(),
+  logWarn: vi.fn(),
+  logError: vi.fn(),
+  initLogFile: vi.fn(),
+  getLogFilePath: vi.fn(() => null),
+}));
+
+vi.mock('../../../src/utils/analytics.js', () => ({
+  analytics: {
+    capture: vi.fn(),
+    setTag: vi.fn(),
+    shutdown: vi.fn(),
+    llmRequest: vi.fn(),
+    incrementAgentIterations: vi.fn(),
+    toolCalled: vi.fn(),
+  },
+}));
+
+import { AgentExecutor } from '../agent-executor.js';
+import { writeEnvLocal } from '../../../src/lib/env-writer.js';
+
+describe('AgentExecutor', () => {
+  let testDir: string;
+
+  beforeEach(() => {
+    testDir = mkdtempSync(join(tmpdir(), 'agent-executor-test-'));
+    // Create package.json so env writing works
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ name: 'test' }));
+    mockRunAgent.mockReset();
+  });
+
+  afterEach(() => {
+    rmSync(testDir, { recursive: true, force: true });
+  });
+
+  it('calls production runAgent with correct AgentRunConfig', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run();
+
+    expect(mockRunAgent).toHaveBeenCalledTimes(1);
+
+    const [agentRunConfig] = mockRunAgent.mock.calls[0];
+    expect(agentRunConfig.workingDirectory).toBe(testDir);
+    expect(agentRunConfig.model).toBe('test-model');
+    expect(agentRunConfig.allowedTools).toContain('Skill');
+    expect(agentRunConfig.allowedTools).toContain('Write');
+    expect(agentRunConfig.mcpServers).toHaveProperty('workos');
+    // Direct mode — no gateway URL
+    expect(agentRunConfig.sdkEnv.ANTHROPIC_API_KEY).toBe('sk-ant-test');
+    expect(agentRunConfig.sdkEnv.ANTHROPIC_BASE_URL).toBeUndefined();
+  });
+
+  it('passes RetryConfig when correction is enabled', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run({ enabled: true, maxRetries: 3 });
+
+    const retryConfig = mockRunAgent.mock.calls[0][5]; // 6th arg
+    expect(retryConfig).toBeDefined();
+    expect(retryConfig.maxRetries).toBe(3);
+    expect(typeof retryConfig.validateAndFormat).toBe('function');
+  });
+
+  it('passes no RetryConfig when correction is disabled', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run({ enabled: false, maxRetries: 0 });
+
+    const retryConfig = mockRunAgent.mock.calls[0][5];
+    expect(retryConfig).toBeUndefined();
+  });
+
+  it('passes InstallerOptions with skipAuth=true', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run();
+
+    const installerOptions = mockRunAgent.mock.calls[0][2]; // 3rd arg
+    expect(installerOptions.skipAuth).toBe(true);
+    expect(installerOptions.installDir).toBe(testDir);
+  });
+
+  it('passes onMessage callback as 7th argument', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run();
+
+    const onMessage = mockRunAgent.mock.calls[0][6]; // 7th arg
+    expect(typeof onMessage).toBe('function');
+  });
+
+  it('maps retryCount=0 to correctionAttempts=0, selfCorrected=false', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    const result = await executor.run();
+
+    expect(result.success).toBe(true);
+    expect(result.correctionAttempts).toBe(0);
+    expect(result.selfCorrected).toBe(false);
+  });
+
+  it('maps retryCount>0 to selfCorrected=true on success', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 2 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    const result = await executor.run();
+
+    expect(result.success).toBe(true);
+    expect(result.correctionAttempts).toBe(2);
+    expect(result.selfCorrected).toBe(true);
+  });
+
+  it('maps runAgent error result to failed AgentResult', async () => {
+    mockRunAgent.mockResolvedValue({
+      error: 'EXECUTION_ERROR',
+      errorMessage: 'SDK crashed',
+      retryCount: 1,
+    });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    const result = await executor.run();
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('SDK crashed');
+    expect(result.correctionAttempts).toBe(1);
+    expect(result.selfCorrected).toBe(false);
+  });
+
+  it('handles runAgent throwing an exception', async () => {
+    mockRunAgent.mockRejectedValue(new Error('Connection refused'));
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    const result = await executor.run();
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('Connection refused');
+    expect(result.correctionAttempts).toBe(0);
+  });
+
+  it('writes env vars before calling runAgent', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run();
+
+    expect(writeEnvLocal).toHaveBeenCalledWith(testDir, {
+      WORKOS_API_KEY: 'sk_test_key',
+      WORKOS_CLIENT_ID: 'client_test_id',
+    });
+  });
+
+  it('onMessage callback collects text output from assistant messages', async () => {
+    // Emit a message from inside the mocked runAgent, as production does mid-run
+    mockRunAgent.mockImplementation(async (...args: unknown[]) => {
+      const onMessage = args[6] as (message: unknown) => void;
+      onMessage({
+        type: 'assistant',
+        message: {
+          content: [{ type: 'text', text: 'Installing AuthKit...' }],
+        },
+      });
+      return { retryCount: 0 };
+    });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    const result = await executor.run();
+
+    expect(result.success).toBe(true);
+    expect(result.output).toContain('Installing AuthKit...');
+  });
+
+  it('builds prompt with correct skill name for framework', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'react-router');
+    await executor.run();
+
+    const prompt = mockRunAgent.mock.calls[0][1]; // 2nd arg
+    expect(prompt).toContain('workos-authkit-react-router');
+    expect(prompt).toContain('react-router');
+  });
+
+  it('defaults to correction enabled with maxRetries=2', async () => {
+    mockRunAgent.mockResolvedValue({ retryCount: 0 });
+
+    const executor = new AgentExecutor(testDir, 'nextjs');
+    await executor.run(); // no retryConfig arg — uses default
+
+    const retryConfig = mockRunAgent.mock.calls[0][5];
+    expect(retryConfig).toBeDefined();
+    expect(retryConfig.maxRetries).toBe(2);
+  });
+});
diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts
index 5a9b135..73c7e98 100644
--- a/tests/evals/agent-executor.ts
+++ b/tests/evals/agent-executor.ts
@@ -1,13 +1,13 @@
-import path from 'node:path';
 import { writeFileSync, existsSync, readFileSync } from 'node:fs';
 import { join } from 'node:path';
-import { fileURLToPath } from 'node:url';
 import { loadCredentials } from './env-loader.js';
 import { writeEnvLocal } from '../../src/lib/env-writer.js';
 import { parseEnvFile } from '../../src/utils/env-parser.js';
 import { getConfig } from '../../src/lib/settings.js';
 import { LatencyTracker } from './latency-tracker.js';
 import { runQuickChecks } from '../../src/lib/validation/quick-checks.js';
+import { runAgent, type AgentRunConfig, type RetryConfig } from '../../src/lib/agent-interface.js';
+import type { InstallerOptions } from '../../src/utils/types.js';
 import type { ToolCall, LatencyMetrics } from './types.js';
 
 export interface AgentResult {
@@ -119,113 +119,78 @@ export class AgentExecutor {
     const skillName = SKILL_NAMES[integration];
     const prompt = this.buildPrompt(skillName);
 
-    // Retry loop coordination
-    let correctionAttempts = 0;
-    const maxRetries = config.enabled ? config.maxRetries : 0;
-    const workDir = this.workDir;
+    // Build SDK environment for direct mode
+    const sdkEnv: Record<string, string | undefined> = {
+      ...process.env,
+      ANTHROPIC_API_KEY: this.credentials.anthropicApiKey,
+      CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true',
+      CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true',
+    };
+    delete sdkEnv.ANTHROPIC_BASE_URL;
+    delete sdkEnv.ANTHROPIC_AUTH_TOKEN;
+
+    // Construct AgentRunConfig directly (bypasses initializeAgent/gateway auth)
+    const agentRunConfig: AgentRunConfig = {
+      workingDirectory: this.workDir,
+      mcpServers: {
+        workos: {
+          command: 'npx',
+          args: ['-y', '@workos/mcp-docs-server'],
+        },
+      },
+      model: getConfig().model,
+      allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'],
+      sdkEnv,
+    };
 
-    // Turn completion signals
-    let resolveCurrentTurn!: () => void;
-    let currentTurnDone!: Promise<void>;
+    // Thin InstallerOptions — only what runAgent needs
+    const installerOptions: InstallerOptions = {
+      debug: this.options.verbose ?? false,
+      forceInstall: false,
+      installDir: this.workDir,
+      local: false,
+      ci: true,
+      skipAuth: true,
+    };
 
-    function resetTurnSignal() {
-      currentTurnDone = new Promise<void>((resolve) => {
-        resolveCurrentTurn = resolve;
-      });
-    }
-    resetTurnSignal();
+    // Build production RetryConfig with validateAndFormat callback
+    const prodRetryConfig: RetryConfig | undefined = config.enabled
+      ? {
+          maxRetries: config.maxRetries,
+          validateAndFormat: async (workingDirectory: string): Promise<string | null> => {
+            const quickResult = await runQuickChecks(workingDirectory);
+            return quickResult.passed ? null : quickResult.agentRetryPrompt;
+          },
+        }
+      : undefined;
 
-    // Initialize and run agent
     try {
-      const { query } = await import('@anthropic-ai/claude-agent-sdk');
+      // Delegate to production runAgent — same retry loop, same generator coordination
+      const result = await runAgent(
+        agentRunConfig,
+        prompt,
+        installerOptions,
+        undefined, // no spinner config
+        undefined, // no emitter
+        prodRetryConfig,
+        (message) => this.trackMessage(message, toolCalls, collectedOutput, label),
+      );
 
-      // Build SDK environment for direct mode
-      const sdkEnv: Record<string, string | undefined> = {
-        ...process.env,
-        ANTHROPIC_API_KEY: this.credentials.anthropicApiKey,
-        CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true',
-        CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true',
-      };
-      // Remove gateway config to use direct API
-      delete sdkEnv.ANTHROPIC_BASE_URL;
-      delete sdkEnv.ANTHROPIC_AUTH_TOKEN;
-
-      // Get plugin path for skills
-      const __filename = fileURLToPath(import.meta.url);
-      const __dirname = path.dirname(__filename);
-      const pluginPath = path.join(__dirname, '../..');
-
-      // Retry-aware prompt stream (same pattern as production agent-interface.ts)
-      const createPromptStream = async function* () {
-        yield {
-          type: 'user',
-          session_id: '',
-          message: { role: 'user', content: prompt },
-          parent_tool_use_id: null,
+      const latencyMetrics = this.latencyTracker.finish();
+      const correctionAttempts = result.retryCount ?? 0;
+
+      if (result.error) {
+        return {
+          success: false,
+          output: collectedOutput.join('\n'),
+          toolCalls,
+          latencyMetrics,
+          error: result.errorMessage ?? String(result.error),
+          correctionAttempts,
+          selfCorrected: false,
         };
-
-        if (maxRetries > 0) {
-          while (correctionAttempts < maxRetries) {
-            await currentTurnDone;
-
-            let validationPrompt: string | null;
-            try {
-              const quickResult = await runQuickChecks(workDir);
-              validationPrompt = quickResult.passed ? null : quickResult.agentRetryPrompt;
-            } catch {
-              validationPrompt = null; // treat validation errors as passed
-            }
-
-            if (validationPrompt === null) break;
-
-            correctionAttempts++;
-            if (label && process.env.EVAL_VERBOSE) {
-              console.log(`${label} Correction attempt ${correctionAttempts}/${maxRetries}`);
-            }
-
-            resetTurnSignal();
-
-            yield {
-              type: 'user',
-              session_id: '',
-              message: { role: 'user', content: validationPrompt },
-              parent_tool_use_id: null,
-            };
-          }
-        }
-
-        // Keep generator alive until final result
-        await currentTurnDone;
-      };
-
-      const response = query({
-        prompt: createPromptStream(),
-        options: {
-          model: getConfig().model,
-          cwd: this.workDir,
-          permissionMode: 'acceptEdits',
-          mcpServers: {
-            workos: {
-              command: 'npx',
-              args: ['-y', '@workos/mcp-docs-server'],
-            },
-          },
-          env: sdkEnv,
-          tools: { type: 'preset', preset: 'claude_code' },
-          allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'],
-          plugins: [{ type: 'local', path: pluginPath }],
-        },
-      });
-
-      // Process message stream — signal turn completion on result
-      for await (const message of response) {
-        this.handleMessage(message, toolCalls, collectedOutput, label);
-        if (message.type === 'result') {
-          resolveCurrentTurn();
-        }
       }
 
-      const latencyMetrics = this.latencyTracker.finish();
       return {
         success: true,
         output: collectedOutput.join('\n'),
@@ -242,7 +207,7 @@ export class AgentExecutor {
         toolCalls,
         latencyMetrics,
         error: error instanceof Error ? error.message : String(error),
-        correctionAttempts,
+        correctionAttempts: 0,
         selfCorrected: false,
       };
     }
@@ -266,15 +231,17 @@ Use the \`${skillName}\` skill to integrate WorkOS AuthKit into this application
 Begin by invoking the ${skillName} skill.`;
   }
 
-  private handleMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void {
+  /**
+   * Observe SDK messages for latency tracking and output collection.
+   * This is called via the onMessage hook — production handleSDKMessage runs first.
+   */
+  private trackMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void {
     if (message.type === 'assistant') {
-      // End any in-progress tool call when we get a new assistant message
       this.latencyTracker.endToolCall();
 
       const content = message.message?.content;
       if (Array.isArray(content)) {
         for (const block of content) {
-          // Capture text output and track TTFT
           if (block.type === 'text' && typeof block.text === 'string') {
             this.latencyTracker.recordFirstContent();
             collectedOutput.push(block.text);
@@ -282,14 +249,12 @@ Begin by invoking the ${skillName} skill.`;
               console.log(`${label} Agent: ${block.text.slice(0, 100)}...`);
             }
           }
-          // Capture tool calls and start timing
           if (block.type === 'tool_use') {
             this.latencyTracker.startToolCall(block.name);
-            const call: ToolCall = {
+            toolCalls.push({
               tool: block.name,
               input: block.input as Record<string, unknown>,
-            };
-            toolCalls.push(call);
+            });
             if (this.options.verbose) {
               console.log(`${label} Tool: ${block.name}`);
             }
@@ -299,7 +264,6 @@ Begin by invoking the ${skillName} skill.`;
     }
 
     if (message.type === 'result') {
-      // Capture token usage from result
       if (message.usage) {
         this.latencyTracker.recordTokens(message.usage.input_tokens ?? 0, message.usage.output_tokens ?? 0);
       }
@@ -310,7 +274,6 @@ Begin by invoking the ${skillName} skill.`;
   }
 
   private getIntegration(): string {
-    // Integration is now a string type — framework name IS the integration name
     return this.framework;
   }
 }

From 81a374e39ec3d5d6132ee5bcf78d9a3e00df3380 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 08:41:09 -0600
Subject: [PATCH 05/14] fix: recalibrate success criteria thresholds for
 correction-aware metrics

First-attempt now means zero corrections, which is stricter than before.
Lower threshold to 30% (aspirational), add withCorrectionPassRate at 90%
as the primary quality gate, keep withRetryPassRate at 95%.
---
 tests/evals/success-criteria.spec.ts | 69 ++++++++++++++++++----------
 tests/evals/success-criteria.ts      |  3 +-
 2 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts
index ba62604..24fc969 100644
--- a/tests/evals/success-criteria.spec.ts
+++ b/tests/evals/success-criteria.spec.ts
@@ -2,30 +2,35 @@ import { describe, it, expect } from 'vitest';
 import { validateResults, DEFAULT_CRITERIA, type SuccessCriteria } from './success-criteria.js';
 import type { EvalResult } from './types.js';
 
-function makeResult(passed: boolean, attempts: number = 1): EvalResult {
+function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: number = 0): EvalResult {
   return {
     scenario: `test-${Math.random().toString(36).slice(2)}`,
     passed,
     duration: 1000,
     attempts,
+    correctionAttempts,
   };
 }
 
 describe('success-criteria', () => {
   describe('DEFAULT_CRITERIA', () => {
     it('has expected default thresholds', () => {
-      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.9);
+      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.3);
+      expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9);
       expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95);
     });
   });
 
   describe('validateResults', () => {
     it('returns passed=true when all criteria met', () => {
-      // 10 results, 9 passed on first attempt, 1 passed on retry
+      // 10 results: 4 clean (40% > 30%), 5 corrected (9/10 = 90% correction), 1 retried (100% retry)
       const results: EvalResult[] = [
-        ...Array(9)
+        ...Array(4)
           .fill(null)
-          .map(() => makeResult(true, 1)),
+          .map(() => makeResult(true, 1, 0)),
+        ...Array(5)
+          .fill(null)
+          .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
       ];
 
@@ -33,34 +38,38 @@ describe('success-criteria', () => {
 
       expect(validation.passed).toBe(true);
       expect(validation.failures).toHaveLength(0);
-      expect(validation.actual.firstAttemptPassRate).toBe(0.9);
+      expect(validation.actual.firstAttemptPassRate).toBe(0.4);
+      expect(validation.actual.withCorrectionPassRate).toBe(0.9);
       expect(validation.actual.withRetryPassRate).toBe(1);
     });
 
     it('returns passed=false when first-attempt rate below threshold', () => {
-      // 10 results, only 8 passed on first attempt
+      // 10 results, only 2 passed on first attempt (20% < 30% threshold)
       const results: EvalResult[] = [
-        ...Array(8)
+        ...Array(2)
           .fill(null)
           .map(() => makeResult(true, 1)),
-        makeResult(true, 2),
+        ...Array(7)
+          .fill(null)
+          .map(() => makeResult(true, 2)),
         makeResult(true, 2),
       ];
 
       const validation = validateResults(results);
 
       expect(validation.passed).toBe(false);
-      expect(validation.failures).toHaveLength(1);
-      expect(validation.failures[0]).toContain('First-attempt');
-      expect(validation.failures[0]).toContain('80.0%');
+      expect(validation.failures.some((f) => f.includes('First-attempt'))).toBe(true);
     });
 
     it('returns passed=false when with-retry rate below threshold', () => {
-      // 10 results, 9 passed first attempt, 1 failed entirely
+      // 10 results: 4 clean, 5 corrected (90% correction), 1 failed → 90% retry < 95%
       const results: EvalResult[] = [
-        ...Array(9)
+        ...Array(4)
           .fill(null)
-          .map(() => makeResult(true, 1)),
+          .map(() => makeResult(true, 1, 0)),
+        ...Array(5)
+          .fill(null)
+          .map(() => makeResult(true, 1, 1)),
         makeResult(false, 3),
       ];
 
@@ -71,21 +80,24 @@ describe('success-criteria', () => {
       expect(validation.failures[0]).toContain('With-retry');
     });
 
-    it('returns both failures when both criteria not met', () => {
-      // 10 results, 7 passed first attempt, 1 failed
+    it('returns both failures when multiple criteria not met', () => {
+      // 10 results, 2 passed first attempt (20% < 30%), 4 failed entirely (60% < 95% retry)
       const results: EvalResult[] = [
-        ...Array(7)
+        ...Array(2)
           .fill(null)
           .map(() => makeResult(true, 1)),
-        makeResult(true, 2),
-        makeResult(true, 2),
-        makeResult(false, 3),
+        ...Array(4)
+          .fill(null)
+          .map(() => makeResult(true, 2)),
+        ...Array(4)
+          .fill(null)
+          .map(() => makeResult(false, 3)),
       ];
 
       const validation = validateResults(results);
 
       expect(validation.passed).toBe(false);
-      expect(validation.failures).toHaveLength(2);
+      expect(validation.failures.length).toBeGreaterThanOrEqual(2);
     });
 
     it('handles empty results array', () => {
@@ -120,11 +132,18 @@ describe('success-criteria', () => {
     });
 
     it('passes when exactly at threshold', () => {
-      // Exactly 90% first-attempt, 95% with-retry
+      // 20 results:
+      //   6 clean first-attempt (attempt=1, corrections=0) → 30% first-attempt
+      //  12 self-corrected (attempt=1, corrections=1)       → 18/20 = 90% with-correction
+      //   1 passed on scenario retry (attempt=2)             → 19/20 = 95% with-retry
+      //   1 failed (attempt=3)
       const results: EvalResult[] = [
-        ...Array(18)
+        ...Array(6)
           .fill(null)
-          .map(() => makeResult(true, 1)),
+          .map(() => makeResult(true, 1, 0)),
+        ...Array(12)
+          .fill(null)
+          .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
         makeResult(false, 3),
       ];
diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts
index e25752b..57f1ed4 100644
--- a/tests/evals/success-criteria.ts
+++ b/tests/evals/success-criteria.ts
@@ -17,7 +17,8 @@ export interface SuccessCriteria {
 
 /** Default thresholds for CI enforcement */
 export const DEFAULT_CRITERIA: SuccessCriteria = {
-  firstAttemptPassRate: 0.9,
+  firstAttemptPassRate: 0.3,
+  withCorrectionPassRate: 0.9,
   withRetryPassRate: 0.95,
 };
 

From 61ee472ebdf9343565898c6a5dbf11f449023cb3 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 08:42:12 -0600
Subject: [PATCH 06/14] chore: disable dotnet eval scenario (broken SDK, no
 runtime)

---
 tests/evals/runner.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts
index 2cadba1..5375ea3 100644
--- a/tests/evals/runner.ts
+++ b/tests/evals/runner.ts
@@ -87,8 +87,8 @@ const SCENARIOS: Scenario[] = [
   { framework: 'elixir', state: 'example', grader: ElixirGrader },
   { framework: 'elixir', state: 'example-auth0', grader: ElixirGrader },
 
-  // .NET (broken — no runtime)
-  { framework: 'dotnet', state: 'example', grader: DotnetGrader },
+  // .NET (disabled — SDK is broken and no runtime available on most machines)
+  // { framework: 'dotnet', state: 'example', grader: DotnetGrader },
 ];
 
 export interface ExtendedEvalOptions extends EvalOptions {

From f891dfeb5efea20c2cd6c3d6c96c5f8dd1573437 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 08:53:49 -0600
Subject: [PATCH 07/14] fix: lower first-attempt threshold to 20% to match
 observed baseline

Two eval runs show ~21-27% first-attempt rate. The correction loop
consistently brings it to 93-100%. Set threshold at 20% to catch
regressions without failing on normal variance.
---
 tests/evals/success-criteria.spec.ts | 12 +++++-------
 tests/evals/success-criteria.ts      |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts
index 24fc969..7a8dae9 100644
--- a/tests/evals/success-criteria.spec.ts
+++ b/tests/evals/success-criteria.spec.ts
@@ -15,7 +15,7 @@ function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: n
 describe('success-criteria', () => {
   describe('DEFAULT_CRITERIA', () => {
     it('has expected default thresholds', () => {
-      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.3);
+      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.2);
       expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9);
       expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95);
     });
@@ -44,14 +44,12 @@ describe('success-criteria', () => {
     });
 
     it('returns passed=false when first-attempt rate below threshold', () => {
-      // 10 results, only 2 passed on first attempt (20% < 30% threshold)
+      // 10 results, only 1 passed on first attempt (10% < 20% threshold)
       const results: EvalResult[] = [
-        ...Array(2)
-          .fill(null)
-          .map(() => makeResult(true, 1)),
-        ...Array(7)
+        makeResult(true, 1, 0),
+        ...Array(8)
           .fill(null)
-          .map(() => makeResult(true, 2)),
+          .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
       ];
 
diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts
index 57f1ed4..110ce51 100644
--- a/tests/evals/success-criteria.ts
+++ b/tests/evals/success-criteria.ts
@@ -17,7 +17,7 @@ export interface SuccessCriteria {
 
 /** Default thresholds for CI enforcement */
 export const DEFAULT_CRITERIA: SuccessCriteria = {
-  firstAttemptPassRate: 0.3,
+  firstAttemptPassRate: 0.2,
   withCorrectionPassRate: 0.9,
   withRetryPassRate: 0.95,
 };

From 46f33bfddf01c2f30e0fc05e734de115db5e08c8 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 08:56:39 -0600
Subject: [PATCH 08/14] fix: skip typecheck on non-TypeScript projects, raise
 first-attempt threshold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

detectTypecheckCommand was falling back to npx tsc --noEmit for every
project including Python, Ruby, Go, etc. Now checks for tsconfig.json
before falling back — no tsconfig means skip typecheck entirely. This
eliminates false correction triggers on non-JS frameworks.

Raises first-attempt threshold to 50% since the false positives were
the main driver of the low rate.
---
 src/lib/validation/quick-checks.spec.ts | 17 ++++++++++-
 src/lib/validation/quick-checks.ts      | 10 +++++--
 tests/evals/success-criteria.spec.ts    | 38 +++++++++++++------------
 tests/evals/success-criteria.ts         |  2 +-
 4 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts
index 888c2b9..4b4cf8c 100644
--- a/src/lib/validation/quick-checks.spec.ts
+++ b/src/lib/validation/quick-checks.spec.ts
@@ -230,11 +230,12 @@ describe('runTypecheckValidation', () => {
     );
   });
 
-  it('falls back to npx tsc --noEmit when no typecheck script', async () => {
+  it('falls back to npx tsc --noEmit when no typecheck script but tsconfig exists', async () => {
     writeFileSync(
       join(testDir, 'package.json'),
       JSON.stringify({ scripts: { build: 'next build' } }),
     );
+    writeFileSync(join(testDir, 'tsconfig.json'), '{}');
     mockSpawn.mockImplementationOnce(() => createMockProcess(0));
 
     await runTypecheckValidation(testDir);
@@ -246,6 +247,20 @@ describe('runTypecheckValidation', () => {
     );
   });
 
+  it('skips typecheck when no tsconfig.json and no typecheck script', async () => {
+    writeFileSync(
+      join(testDir, 'package.json'),
+      JSON.stringify({ scripts: { build: 'go build' } }),
+    );
+    // No tsconfig.json — not a TypeScript project
+
+    const result = await runTypecheckValidation(testDir);
+
+    expect(result.passed).toBe(true);
+    expect(result.issues).toHaveLength(0);
+    expect(mockSpawn).not.toHaveBeenCalled();
+  });
+
   it('detects type-check script (hyphenated variant)', async () => {
     writeFileSync(
       join(testDir, 'package.json'),
diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts
index 1a0b3a2..128eee7 100644
--- a/src/lib/validation/quick-checks.ts
+++ b/src/lib/validation/quick-checks.ts
@@ -155,8 +155,14 @@ async function detectTypecheckCommand(projectDir: string): Promise<TypecheckComm
     // No package.json or malformed — continue detection
   }
 
-  // Fallback: use npx tsc --noEmit
-  return { command: 'npx', args: ['tsc', '--noEmit'] };
+  // Only fall back to tsc if the project actually uses TypeScript
+  try {
+    await readFile(join(projectDir, 'tsconfig.json'), 'utf-8');
+    return { command: 'npx', args: ['tsc', '--noEmit'] };
+  } catch {
+    // No tsconfig.json — not a TypeScript project, skip typecheck
+    return null;
+  }
 }
 
 /**
diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts
index 7a8dae9..ce746bf 100644
--- a/tests/evals/success-criteria.spec.ts
+++ b/tests/evals/success-criteria.spec.ts
@@ -15,7 +15,7 @@ function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: n
 describe('success-criteria', () => {
   describe('DEFAULT_CRITERIA', () => {
     it('has expected default thresholds', () => {
-      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.2);
+      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.5);
       expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9);
       expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95);
     });
@@ -23,12 +23,12 @@ describe('success-criteria', () => {
 
   describe('validateResults', () => {
     it('returns passed=true when all criteria met', () => {
-      // 10 results: 4 clean (40% > 30%), 5 corrected (9/10 = 90% correction), 1 retried (100% retry)
+      // 10 results: 6 clean (60% > 50%), 3 corrected (9/10 = 90% correction), 1 retried (100% retry)
       const results: EvalResult[] = [
-        ...Array(4)
+        ...Array(6)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(5)
+        ...Array(3)
           .fill(null)
           .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
@@ -38,16 +38,18 @@ describe('success-criteria', () => {
 
       expect(validation.passed).toBe(true);
       expect(validation.failures).toHaveLength(0);
-      expect(validation.actual.firstAttemptPassRate).toBe(0.4);
+      expect(validation.actual.firstAttemptPassRate).toBe(0.6);
       expect(validation.actual.withCorrectionPassRate).toBe(0.9);
       expect(validation.actual.withRetryPassRate).toBe(1);
     });
 
     it('returns passed=false when first-attempt rate below threshold', () => {
-      // 10 results, only 1 passed on first attempt (10% < 20% threshold)
+      // 10 results: 4 clean (40% < 50%), 5 corrected (90% correction), 1 retried
       const results: EvalResult[] = [
-        makeResult(true, 1, 0),
-        ...Array(8)
+        ...Array(4)
+          .fill(null)
+          .map(() => makeResult(true, 1, 0)),
+        ...Array(5)
           .fill(null)
           .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
@@ -60,12 +62,12 @@ describe('success-criteria', () => {
     });
 
     it('returns passed=false when with-retry rate below threshold', () => {
-      // 10 results: 4 clean, 5 corrected (90% correction), 1 failed → 90% retry < 95%
+      // 10 results: 6 clean (60%), 3 corrected (90% correction), 1 failed → 90% retry < 95%
       const results: EvalResult[] = [
-        ...Array(4)
+        ...Array(6)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(5)
+        ...Array(3)
           .fill(null)
           .map(() => makeResult(true, 1, 1)),
         makeResult(false, 3),
@@ -79,14 +81,14 @@ describe('success-criteria', () => {
     });
 
     it('returns both failures when multiple criteria not met', () => {
-      // 10 results, 2 passed first attempt (20% < 30%), 4 failed entirely (60% < 95% retry)
+      // 10 results: 2 clean (20% < 50%), 4 corrected, 4 failed (60% < 95% retry)
       const results: EvalResult[] = [
         ...Array(2)
           .fill(null)
-          .map(() => makeResult(true, 1)),
+          .map(() => makeResult(true, 1, 0)),
         ...Array(4)
           .fill(null)
-          .map(() => makeResult(true, 2)),
+          .map(() => makeResult(true, 1, 1)),
         ...Array(4)
           .fill(null)
           .map(() => makeResult(false, 3)),
@@ -131,15 +133,15 @@ describe('success-criteria', () => {
 
     it('passes when exactly at threshold', () => {
       // 20 results:
-      //   6 clean first-attempt (attempt=1, corrections=0) → 30% first-attempt
-      //  12 self-corrected (attempt=1, corrections=1)       → 18/20 = 90% with-correction
+      //  10 clean first-attempt (attempt=1, corrections=0) → 50% first-attempt
+      //   8 self-corrected (attempt=1, corrections=1)       → 18/20 = 90% with-correction
       //   1 passed on scenario retry (attempt=2)             → 19/20 = 95% with-retry
       //   1 failed (attempt=3)
       const results: EvalResult[] = [
-        ...Array(6)
+        ...Array(10)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(12)
+        ...Array(8)
           .fill(null)
           .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts
index 110ce51..3373397 100644
--- a/tests/evals/success-criteria.ts
+++ b/tests/evals/success-criteria.ts
@@ -17,7 +17,7 @@ export interface SuccessCriteria {
 
 /** Default thresholds for CI enforcement */
 export const DEFAULT_CRITERIA: SuccessCriteria = {
-  firstAttemptPassRate: 0.2,
+  firstAttemptPassRate: 0.5,
   withCorrectionPassRate: 0.9,
   withRetryPassRate: 0.95,
 };

From 807116c14d80131e410abc0691270020c24b0b16 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 11:35:31 -0600
Subject: [PATCH 09/14] feat: detect build systems beyond package.json for
 multi-language support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend quick-checks to auto-detect Go (go.mod), Elixir (mix.exs),
.NET (*.csproj), and Kotlin/Java (build.gradle) build commands from
project files. Interpreted languages (Python, Ruby, PHP) pass through
silently — no universal build command exists for them.
---
 src/lib/validation/build-validator.spec.ts | 116 +++++++++++++++++++++
 src/lib/validation/build-validator.ts      |  50 ++++++++-
 src/lib/validation/quick-checks.spec.ts    |  20 ++++
 src/lib/validation/quick-checks.ts         |  60 +++++++++--
 4 files changed, 238 insertions(+), 8 deletions(-)
 create mode 100644 src/lib/validation/build-validator.spec.ts

diff --git a/src/lib/validation/build-validator.spec.ts b/src/lib/validation/build-validator.spec.ts
new file mode 100644
index 0000000..41273b7
--- /dev/null
+++ b/src/lib/validation/build-validator.spec.ts
@@ -0,0 +1,116 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { mkdtempSync, writeFileSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { detectBuildCommand } from './build-validator.js';
+
+describe('detectBuildCommand', () => {
+  let testDir: string;
+
+  beforeEach(() => {
+    testDir = mkdtempSync(join(tmpdir(), 'build-detect-test-'));
+  });
+
+  afterEach(() => {
+    rmSync(testDir, { recursive: true, force: true });
+  });
+
+  it('detects package.json with build script (pnpm)', async () => {
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } }));
+    writeFileSync(join(testDir, 'pnpm-lock.yaml'), '');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'pnpm', args: ['build'] });
+  });
+
+  it('detects package.json with build script (npm)', async () => {
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'react-scripts build' } }));
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'npm', args: ['run', 'build'] });
+  });
+
+  it('skips package.json without build script', async () => {
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { start: 'node index.js' } }));
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toBeNull();
+  });
+
+  it('detects go.mod → go build', async () => {
+    writeFileSync(join(testDir, 'go.mod'), 'module example.com/app\n\ngo 1.21\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'go', args: ['build', './...'] });
+  });
+
+  it('detects mix.exs → mix compile', async () => {
+    writeFileSync(join(testDir, 'mix.exs'), 'defmodule MyApp.MixProject do\nend\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'mix', args: ['compile'] });
+  });
+
+  it('detects *.csproj → dotnet build', async () => {
+    writeFileSync(join(testDir, 'MyApp.csproj'), '<Project Sdk="Microsoft.NET.Sdk">\n</Project>\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'dotnet', args: ['build'] });
+  });
+
+  it('detects build.gradle.kts with gradlew → ./gradlew build', async () => {
+    writeFileSync(join(testDir, 'build.gradle.kts'), 'plugins { kotlin("jvm") }\n');
+    writeFileSync(join(testDir, 'gradlew'), '#!/bin/sh\nexec gradle "$@"\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: './gradlew', args: ['build'] });
+  });
+
+  it('detects build.gradle without gradlew → gradle build', async () => {
+    writeFileSync(join(testDir, 'build.gradle'), 'apply plugin: "java"\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'gradle', args: ['build'] });
+  });
+
+  it('returns null for empty directory', async () => {
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toBeNull();
+  });
+
+  it('returns null for Python project (no universal build)', async () => {
+    writeFileSync(join(testDir, 'pyproject.toml'), '[project]\nname = "myapp"\n');
+    writeFileSync(join(testDir, 'app.py'), 'print("hello")\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toBeNull();
+  });
+
+  it('returns null for Ruby project (no universal build)', async () => {
+    writeFileSync(join(testDir, 'Gemfile'), 'source "https://rubygems.org"\ngem "rails"\n');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toBeNull();
+  });
+
+  it('package.json build script takes priority over go.mod', async () => {
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'tsc' } }));
+    writeFileSync(join(testDir, 'go.mod'), 'module example.com/app\n');
+    writeFileSync(join(testDir, 'pnpm-lock.yaml'), '');
+
+    const cmd = await detectBuildCommand(testDir);
+
+    expect(cmd).toEqual({ command: 'pnpm', args: ['build'] });
+  });
+});
diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts
index 2e5fc9f..52836c5 100644
--- a/src/lib/validation/build-validator.ts
+++ b/src/lib/validation/build-validator.ts
@@ -1,5 +1,5 @@
 import { spawn } from 'child_process';
-import { existsSync } from 'fs';
+import { existsSync, readdirSync } from 'fs';
 import { readFile } from 'fs/promises';
 import { join } from 'path';
 import type { ValidationIssue } from './types.js';
@@ -115,6 +115,54 @@ export async function hasBuildScriptInPackageJson(projectDir: string): Promise<b
   }
 }
 
+export interface BuildCommand {
+  command: string;
+  args: string[];
+}
+
+/**
+ * Detect the build command for a project by checking ecosystem markers.
+ * Returns null if no build system detected — caller should skip build validation.
+ */
+export async function detectBuildCommand(projectDir: string): Promise<BuildCommand | null> {
+  // 1. package.json with build script (JS/TS frameworks)
+  const pm = detectPackageManager(projectDir);
+  if (await hasBuildScriptInPackageJson(projectDir)) {
+    const args = pm === 'npm' ? ['run', 'build'] : ['build'];
+    return { command: pm, args };
+  }
+
+  // 2. Go (go.mod → go build ./...)
+  if (existsSync(join(projectDir, 'go.mod'))) {
+    return { command: 'go', args: ['build', './...'] };
+  }
+
+  // 3. Elixir (mix.exs → mix compile)
+  if (existsSync(join(projectDir, 'mix.exs'))) {
+    return { command: 'mix', args: ['compile'] };
+  }
+
+  // 4. .NET (*.csproj → dotnet build)
+  try {
+    const files = readdirSync(projectDir);
+    if (files.some((f) => f.endsWith('.csproj'))) {
+      return { command: 'dotnet', args: ['build'] };
+    }
+  } catch {
+    // Can't read directory — skip
+  }
+
+  // 5. Kotlin/Java (build.gradle.kts or build.gradle → gradlew/gradle build)
+  if (existsSync(join(projectDir, 'build.gradle.kts')) || existsSync(join(projectDir, 'build.gradle'))) {
+    const gradlew = existsSync(join(projectDir, 'gradlew')) ? './gradlew' : 'gradle';
+    return { command: gradlew, args: ['build'] };
+  }
+
+  // Interpreted languages (Python, Ruby, PHP) have no universal build command.
+  // Return null — quick-checks will skip the build step silently.
+  return null;
+}
+
 export function parseBuildErrors(output: string): string[] {
   const errors: string[] = [];
 
diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts
index 4b4cf8c..190221f 100644
--- a/src/lib/validation/quick-checks.spec.ts
+++ b/src/lib/validation/quick-checks.spec.ts
@@ -138,6 +138,26 @@ describe('runQuickChecks', () => {
     expect(result.results[1].phase).toBe('build');
     expect(result.agentRetryPrompt).toContain('build failed');
   });
+
+  it('skips build when no build system detected (e.g., Python project)', async () => {
+    // Rewrite testDir without a build script or any build system markers
+    writeFileSync(
+      join(testDir, 'package.json'),
+      JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } }),
+    );
+
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0)); // typecheck pass only
+
+    const result = await runQuickChecks(testDir);
+
+    expect(result.passed).toBe(true);
+    expect(result.results).toHaveLength(2);
+    expect(result.results[0].phase).toBe('typecheck');
+    expect(result.results[1].phase).toBe('build');
+    expect(result.results[1].passed).toBe(true); // passed through silently
+    // Only one spawn call (typecheck) — no spawn for build
+    expect(mockSpawn).toHaveBeenCalledTimes(1);
+  });
 });
 
 describe('runTypecheckValidation', () => {
diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts
index 128eee7..6db3132 100644
--- a/src/lib/validation/quick-checks.ts
+++ b/src/lib/validation/quick-checks.ts
@@ -2,7 +2,7 @@ import { spawn } from 'child_process';
 import { readFile } from 'fs/promises';
 import { join } from 'path';
 import type { QuickCheckResult, QuickChecksOutput, ValidationIssue } from './types.js';
-import { detectPackageManager, parseBuildErrors, runBuildValidation } from './build-validator.js';
+import { detectBuildCommand, detectPackageManager, parseBuildErrors } from './build-validator.js';
 
 const DEFAULT_TYPECHECK_TIMEOUT_MS = 30_000;
 const DEFAULT_BUILD_TIMEOUT_MS = 60_000;
@@ -111,17 +111,63 @@ export async function runTypecheckValidation(
 }
 
 /**
- * Run build as a quick check, wrapping the existing runBuildValidation.
+ * Run build as a quick check using auto-detected build command.
+ * Supports JS (package.json), Go (go.mod), Elixir (mix.exs), .NET (*.csproj), Kotlin/Java (build.gradle).
+ * Returns passed when no build system detected — quick-checks are an optimization, not a requirement.
  */
 async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise<QuickCheckResult> {
-  const buildResult = await runBuildValidation(projectDir, timeoutMs);
+  const startTime = Date.now();
+  const buildCmd = await detectBuildCommand(projectDir);
+
+  if (!buildCmd) {
+    return {
+      passed: true,
+      phase: 'build',
+      issues: [],
+      agentPrompt: null,
+      durationMs: Date.now() - startTime,
+    };
+  }
+
+  const { exitCode, stdout, stderr } = await spawnCommand(
+    buildCmd.command,
+    buildCmd.args,
+    projectDir,
+    timeoutMs,
+  );
+
+  if (exitCode === 0) {
+    return {
+      passed: true,
+      phase: 'build',
+      issues: [],
+      agentPrompt: null,
+      durationMs: Date.now() - startTime,
+    };
+  }
+
+  const output = stdout + stderr;
+  const errors = parseBuildErrors(output);
+  const issues: ValidationIssue[] = errors.length > 0
+    ? errors.map((e) => ({
+        type: 'file' as const,
+        severity: 'error' as const,
+        message: `Build error: ${e}`,
+        hint: 'Fix the error and run build again',
+      }))
+    : [{
+        type: 'file' as const,
+        severity: 'error' as const,
+        message: 'Build failed',
+        hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`,
+      }];
 
   return {
-    passed: buildResult.success,
+    passed: false,
     phase: 'build',
-    issues: buildResult.issues,
-    agentPrompt: buildResult.success ? null : formatBuildErrors(buildResult.issues),
-    durationMs: buildResult.durationMs,
+    issues,
+    agentPrompt: formatBuildErrors(issues),
+    durationMs: Date.now() - startTime,
   };
 }
 

From e5d4abfb7ab60b711ea84528bfa42e9c00626634 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 11:38:21 -0600
Subject: [PATCH 10/14] fix: bump first-attempt threshold to 80% and fix
 quality grader JSON parsing

Raise firstAttemptPassRate from 50% to 80% now that false positives
from non-TS projects are eliminated (85.7% observed in latest run).

Fix quality grader parsing: the greedy regex matched braces inside
<thinking> tags. Now extracts JSON only after </thinking> and uses
a non-greedy pattern to avoid capturing nested objects.
---
 tests/evals/graders/quality-grader.ts |  5 ++--
 tests/evals/success-criteria.spec.ts  | 34 ++++++++++++---------------
 tests/evals/success-criteria.ts       |  2 +-
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/tests/evals/graders/quality-grader.ts b/tests/evals/graders/quality-grader.ts
index 91165a4..22d1bf6 100644
--- a/tests/evals/graders/quality-grader.ts
+++ b/tests/evals/graders/quality-grader.ts
@@ -88,8 +88,9 @@ Then, output your final scores as JSON.
       const thinkingMatch = text.match(/<thinking>([\s\S]*?)<\/thinking>/);
       const reasoning = thinkingMatch?.[1]?.trim() || 'No reasoning provided';
 
-      // Extract JSON scores (after thinking block)
-      const jsonMatch = text.match(/\{[\s\S]*\}/);
+      // Extract JSON scores — look after </thinking> tag to avoid matching braces in reasoning
+      const afterThinking = thinkingMatch ? text.slice(text.indexOf('</thinking>') + '</thinking>'.length) : text;
+      const jsonMatch = afterThinking.match(/\{[^{}]*\}/);
       if (!jsonMatch) return null;
 
       const parsed = JSON.parse(jsonMatch[0]) as Record<string, unknown>;
diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts
index ce746bf..c0e5667 100644
--- a/tests/evals/success-criteria.spec.ts
+++ b/tests/evals/success-criteria.spec.ts
@@ -15,7 +15,7 @@ function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: n
 describe('success-criteria', () => {
   describe('DEFAULT_CRITERIA', () => {
     it('has expected default thresholds', () => {
-      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.5);
+      expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.8);
       expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9);
       expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95);
     });
@@ -23,14 +23,12 @@ describe('success-criteria', () => {
 
   describe('validateResults', () => {
     it('returns passed=true when all criteria met', () => {
-      // 10 results: 6 clean (60% > 50%), 3 corrected (9/10 = 90% correction), 1 retried (100% retry)
+      // 10 results: 8 clean (80%), 1 corrected (9/10 = 90% correction), 1 retried (100% retry)
       const results: EvalResult[] = [
-        ...Array(6)
+        ...Array(8)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(3)
-          .fill(null)
-          .map(() => makeResult(true, 1, 1)),
+        makeResult(true, 1, 1),
         makeResult(true, 2),
       ];
 
@@ -38,18 +36,18 @@ describe('success-criteria', () => {
 
       expect(validation.passed).toBe(true);
       expect(validation.failures).toHaveLength(0);
-      expect(validation.actual.firstAttemptPassRate).toBe(0.6);
+      expect(validation.actual.firstAttemptPassRate).toBe(0.8);
       expect(validation.actual.withCorrectionPassRate).toBe(0.9);
       expect(validation.actual.withRetryPassRate).toBe(1);
     });
 
     it('returns passed=false when first-attempt rate below threshold', () => {
-      // 10 results: 4 clean (40% < 50%), 5 corrected (90% correction), 1 retried
+      // 10 results: 7 clean (70% < 80%), 2 corrected (90% correction), 1 retried
       const results: EvalResult[] = [
-        ...Array(4)
+        ...Array(7)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(5)
+        ...Array(2)
           .fill(null)
           .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
@@ -62,14 +60,12 @@ describe('success-criteria', () => {
     });
 
     it('returns passed=false when with-retry rate below threshold', () => {
-      // 10 results: 6 clean (60%), 3 corrected (90% correction), 1 failed → 90% retry < 95%
+      // 10 results: 8 clean (80%), 1 corrected (90% correction), 1 failed → 90% retry < 95%
       const results: EvalResult[] = [
-        ...Array(6)
+        ...Array(8)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(3)
-          .fill(null)
-          .map(() => makeResult(true, 1, 1)),
+        makeResult(true, 1, 1),
         makeResult(false, 3),
       ];
 
@@ -133,15 +129,15 @@ describe('success-criteria', () => {
 
     it('passes when exactly at threshold', () => {
       // 20 results:
-      //  10 clean first-attempt (attempt=1, corrections=0) → 50% first-attempt
-      //   8 self-corrected (attempt=1, corrections=1)       → 18/20 = 90% with-correction
+      //  16 clean first-attempt (attempt=1, corrections=0) → 80% first-attempt
+      //   2 self-corrected (attempt=1, corrections=1)       → 18/20 = 90% with-correction
       //   1 passed on scenario retry (attempt=2)             → 19/20 = 95% with-retry
       //   1 failed (attempt=3)
       const results: EvalResult[] = [
-        ...Array(10)
+        ...Array(16)
           .fill(null)
           .map(() => makeResult(true, 1, 0)),
-        ...Array(8)
+        ...Array(2)
           .fill(null)
           .map(() => makeResult(true, 1, 1)),
         makeResult(true, 2),
diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts
index 3373397..a2c2f63 100644
--- a/tests/evals/success-criteria.ts
+++ b/tests/evals/success-criteria.ts
@@ -17,7 +17,7 @@ export interface SuccessCriteria {
 
 /** Default thresholds for CI enforcement */
 export const DEFAULT_CRITERIA: SuccessCriteria = {
-  firstAttemptPassRate: 0.5,
+  firstAttemptPassRate: 0.8,
   withCorrectionPassRate: 0.9,
   withRetryPassRate: 0.95,
 };

From b21edf7925830d31a18d80b4dcab3257b486665c Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 11:59:43 -0600
Subject: [PATCH 11/14] chore: formatting

---
 src/lib/agent-interface.spec.ts              | 106 ++++++-------------
 src/lib/agent-interface.ts                   |   6 +-
 src/lib/agent-runner.ts                      |  11 +-
 src/lib/validation/quick-checks.spec.ts      |  60 +++--------
 src/lib/validation/quick-checks.ts           |  41 ++++---
 src/lib/validation/validator.ts              |  13 +--
 tests/evals/__tests__/agent-executor.spec.ts |   6 +-
 tests/evals/reporter.ts                      |   4 +-
 tests/evals/success-criteria.ts              |   6 +-
 9 files changed, 95 insertions(+), 158 deletions(-)

diff --git a/src/lib/agent-interface.spec.ts b/src/lib/agent-interface.spec.ts
index d627276..b266aec 100644
--- a/src/lib/agent-interface.spec.ts
+++ b/src/lib/agent-interface.spec.ts
@@ -10,7 +10,11 @@ const { mockQuery, mockConfig } = vi.hoisted(() => ({
     proxy: { refreshThresholdMs: 300000 },
     nodeVersion: '20',
     logging: { debugMode: false },
-    documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' },
+    documentation: {
+      workosDocsUrl: 'https://workos.com/docs',
+      dashboardUrl: 'https://dashboard.workos.com',
+      issuesUrl: 'https://github.com',
+    },
     frameworks: {},
     legacy: { oauthPort: 3000 },
     branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false },
@@ -149,37 +153,23 @@ describe('runAgent retry loop', () => {
   });
 
   it('returns retryCount=0 when no retryConfig provided', async () => {
-    mockQuery.mockImplementation(
-      createMockSDKResponse([{ text: 'Done!' }]),
-    );
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }]));
 
-    const result = await runAgent(
-      makeAgentConfig(),
-      'Test prompt',
-      makeOptions(),
-      undefined,
-      emitter,
-    );
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
 
     expect(result.error).toBeUndefined();
     expect(result.retryCount).toBe(0);
   });
 
   it('returns retryCount=0 when validation passes first try', async () => {
-    mockQuery.mockImplementation(
-      createMockSDKResponse([{ text: 'Done!' }]),
-    );
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }]));
 
     const validateAndFormat = vi.fn().mockResolvedValue(null); // passes
 
-    const result = await runAgent(
-      makeAgentConfig(),
-      'Test prompt',
-      makeOptions(),
-      undefined,
-      emitter,
-      { maxRetries: 2, validateAndFormat },
-    );
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 2,
+      validateAndFormat,
+    });
 
     expect(result.error).toBeUndefined();
     expect(result.retryCount).toBe(0);
@@ -199,25 +189,17 @@ describe('runAgent retry loop', () => {
 
   it('retries once when validation fails then passes', async () => {
     // Two turns: initial + one retry
-    mockQuery.mockImplementation(
-      createMockSDKResponse([
-        { text: 'Initial attempt' },
-        { text: 'Fixed it!' },
-      ]),
-    );
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Initial attempt' }, { text: 'Fixed it!' }]));
 
-    const validateAndFormat = vi.fn()
+    const validateAndFormat = vi
+      .fn()
       .mockResolvedValueOnce('Type error in src/foo.ts') // fail first
       .mockResolvedValueOnce(null); // pass second
 
-    const result = await runAgent(
-      makeAgentConfig(),
-      'Test prompt',
-      makeOptions(),
-      undefined,
-      emitter,
-      { maxRetries: 2, validateAndFormat },
-    );
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 2,
+      validateAndFormat,
+    });
 
     expect(result.error).toBeUndefined();
     expect(result.retryCount).toBe(1);
@@ -232,23 +214,15 @@ describe('runAgent retry loop', () => {
   it('caps at maxRetries when validation always fails', async () => {
     // Three turns: initial + 2 retries
     mockQuery.mockImplementation(
-      createMockSDKResponse([
-        { text: 'Attempt 1' },
-        { text: 'Attempt 2' },
-        { text: 'Attempt 3' },
-      ]),
+      createMockSDKResponse([{ text: 'Attempt 1' }, { text: 'Attempt 2' }, { text: 'Attempt 3' }]),
     );
 
     const validateAndFormat = vi.fn().mockResolvedValue('Still broken');
 
-    const result = await runAgent(
-      makeAgentConfig(),
-      'Test prompt',
-      makeOptions(),
-      undefined,
-      emitter,
-      { maxRetries: 2, validateAndFormat },
-    );
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 2,
+      validateAndFormat,
+    });
 
     expect(result.error).toBeUndefined();
     expect(result.retryCount).toBe(2);
@@ -261,20 +235,14 @@ describe('runAgent retry loop', () => {
   });
 
   it('preserves existing behavior with maxRetries=0', async () => {
-    mockQuery.mockImplementation(
-      createMockSDKResponse([{ text: 'Done!' }]),
-    );
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }]));
 
     const validateAndFormat = vi.fn().mockResolvedValue('Error');
 
-    const result = await runAgent(
-      makeAgentConfig(),
-      'Test prompt',
-      makeOptions(),
-      undefined,
-      emitter,
-      { maxRetries: 0, validateAndFormat },
-    );
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 0,
+      validateAndFormat,
+    });
 
     expect(result.error).toBeUndefined();
     expect(result.retryCount).toBe(0);
@@ -283,20 +251,14 @@ describe('runAgent retry loop', () => {
   });
 
   it('treats validateAndFormat errors as passed', async () => {
-    mockQuery.mockImplementation(
-      createMockSDKResponse([{ text: 'Done!' }]),
-    );
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }]));
 
     const validateAndFormat = vi.fn().mockRejectedValue(new Error('Validation crashed'));
 
-    const result = await runAgent(
-      makeAgentConfig(),
-      'Test prompt',
-      makeOptions(),
-      undefined,
-      emitter,
-      { maxRetries: 2, validateAndFormat },
-    );
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 2,
+      validateAndFormat,
+    });
 
     expect(result.error).toBeUndefined();
     expect(result.retryCount).toBe(0);
diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 5b0018c..65b168f 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -634,7 +634,11 @@ export async function runAgent(
         resolveCurrentTurn();
       }
       // Let callers observe messages (e.g., for latency tracking in evals)
-      try { onMessage?.(message); } catch { /* observer errors are non-critical */ }
+      try {
+        onMessage?.(message);
+      } catch {
+        /* observer errors are non-critical */
+      }
     }
 
     const durationMs = Date.now() - startTime;
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index b2eef7d..a84aef8 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -120,11 +120,12 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
   };
 
   // Build retry config
-  const retryConfig: RetryConfig | undefined =
-    options.noValidate ? undefined : {
-      maxRetries: options.maxRetries ?? 2,
-      validateAndFormat,
-    };
+  const retryConfig: RetryConfig | undefined = options.noValidate
+    ? undefined
+    : {
+        maxRetries: options.maxRetries ?? 2,
+        validateAndFormat,
+      };
 
   // Run agent with retry support — agent gets correction prompts on validation failure
   const agentResult = await runAgent(
diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts
index 190221f..a36dd1e 100644
--- a/src/lib/validation/quick-checks.spec.ts
+++ b/src/lib/validation/quick-checks.spec.ts
@@ -52,9 +52,7 @@ describe('runQuickChecks', () => {
   });
 
   it('returns passed=true when both typecheck and build succeed', async () => {
-    mockSpawn
-      .mockImplementationOnce(() => createMockProcess(0))
-      .mockImplementationOnce(() => createMockProcess(0));
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0));
 
     const result = await runQuickChecks(testDir);
 
@@ -80,9 +78,7 @@ describe('runQuickChecks', () => {
   });
 
   it('runs build after typecheck passes', async () => {
-    mockSpawn
-      .mockImplementationOnce(() => createMockProcess(0))
-      .mockImplementationOnce(() => createMockProcess(0));
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0));
 
     const result = await runQuickChecks(testDir);
 
@@ -103,7 +99,8 @@ describe('runQuickChecks', () => {
   });
 
   it('generates agentRetryPrompt when typecheck fails', async () => {
-    const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'.";
+    const tsError =
+      "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'.";
     mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError));
 
     const result = await runQuickChecks(testDir);
@@ -114,9 +111,7 @@ describe('runQuickChecks', () => {
   });
 
   it('tracks total duration', async () => {
-    mockSpawn
-      .mockImplementationOnce(() => createMockProcess(0))
-      .mockImplementationOnce(() => createMockProcess(0));
+    mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0));
 
     const result = await runQuickChecks(testDir);
 
@@ -141,10 +136,7 @@ describe('runQuickChecks', () => {
 
   it('skips build when no build system detected (e.g., Python project)', async () => {
     // Rewrite testDir without a build script or any build system markers
-    writeFileSync(
-      join(testDir, 'package.json'),
-      JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } }),
-    );
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } }));
 
     mockSpawn.mockImplementationOnce(() => createMockProcess(0)); // typecheck pass only
 
@@ -216,8 +208,7 @@ describe('runTypecheckValidation', () => {
   });
 
   it('handles pretty-printed tsc errors (colon-separated format)', async () => {
-    const tsError =
-      "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'.";
+    const tsError = "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'.";
     mockSpawn.mockImplementationOnce(() => createMockProcess(1, tsError, ''));
 
     const result = await runTypecheckValidation(testDir);
@@ -227,9 +218,7 @@ describe('runTypecheckValidation', () => {
   });
 
   it('provides fallback message when errors cannot be parsed', async () => {
-    mockSpawn.mockImplementationOnce(() =>
-      createMockProcess(1, '', 'Some unknown error format that we cannot parse'),
-    );
+    mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', 'Some unknown error format that we cannot parse'));
 
     const result = await runTypecheckValidation(testDir);
 
@@ -243,35 +232,21 @@ describe('runTypecheckValidation', () => {
 
     await runTypecheckValidation(testDir);
 
-    expect(mockSpawn).toHaveBeenCalledWith(
-      'pnpm',
-      ['typecheck'],
-      expect.objectContaining({ cwd: testDir }),
-    );
+    expect(mockSpawn).toHaveBeenCalledWith('pnpm', ['typecheck'], expect.objectContaining({ cwd: testDir }));
   });
 
   it('falls back to npx tsc --noEmit when no typecheck script but tsconfig exists', async () => {
-    writeFileSync(
-      join(testDir, 'package.json'),
-      JSON.stringify({ scripts: { build: 'next build' } }),
-    );
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } }));
     writeFileSync(join(testDir, 'tsconfig.json'), '{}');
     mockSpawn.mockImplementationOnce(() => createMockProcess(0));
 
     await runTypecheckValidation(testDir);
 
-    expect(mockSpawn).toHaveBeenCalledWith(
-      'npx',
-      ['tsc', '--noEmit'],
-      expect.objectContaining({ cwd: testDir }),
-    );
+    expect(mockSpawn).toHaveBeenCalledWith('npx', ['tsc', '--noEmit'], expect.objectContaining({ cwd: testDir }));
   });
 
   it('skips typecheck when no tsconfig.json and no typecheck script', async () => {
-    writeFileSync(
-      join(testDir, 'package.json'),
-      JSON.stringify({ scripts: { build: 'go build' } }),
-    );
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'go build' } }));
     // No tsconfig.json — not a TypeScript project
 
     const result = await runTypecheckValidation(testDir);
@@ -282,19 +257,12 @@ describe('runTypecheckValidation', () => {
   });
 
   it('detects type-check script (hyphenated variant)', async () => {
-    writeFileSync(
-      join(testDir, 'package.json'),
-      JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } }),
-    );
+    writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } }));
     mockSpawn.mockImplementationOnce(() => createMockProcess(0));
 
     await runTypecheckValidation(testDir);
 
-    expect(mockSpawn).toHaveBeenCalledWith(
-      'pnpm',
-      ['type-check'],
-      expect.objectContaining({ cwd: testDir }),
-    );
+    expect(mockSpawn).toHaveBeenCalledWith('pnpm', ['type-check'], expect.objectContaining({ cwd: testDir }));
   });
 
   it('tracks duration', async () => {
diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts
index 6db3132..3612209 100644
--- a/src/lib/validation/quick-checks.ts
+++ b/src/lib/validation/quick-checks.ts
@@ -19,10 +19,7 @@ export async function runQuickChecks(
   const results: QuickCheckResult[] = [];
 
   // Step 1: Typecheck
-  const typecheckResult = await runTypecheckValidation(
-    projectDir,
-    options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS,
-  );
+  const typecheckResult = await runTypecheckValidation(projectDir, options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS);
   results.push(typecheckResult);
 
   // Step 2: Build — only if typecheck passed and build not skipped
@@ -129,12 +126,7 @@ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promis
     };
   }
 
-  const { exitCode, stdout, stderr } = await spawnCommand(
-    buildCmd.command,
-    buildCmd.args,
-    projectDir,
-    timeoutMs,
-  );
+  const { exitCode, stdout, stderr } = await spawnCommand(buildCmd.command, buildCmd.args, projectDir, timeoutMs);
 
   if (exitCode === 0) {
     return {
@@ -148,19 +140,22 @@ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promis
 
   const output = stdout + stderr;
   const errors = parseBuildErrors(output);
-  const issues: ValidationIssue[] = errors.length > 0
-    ? errors.map((e) => ({
-        type: 'file' as const,
-        severity: 'error' as const,
-        message: `Build error: ${e}`,
-        hint: 'Fix the error and run build again',
-      }))
-    : [{
-        type: 'file' as const,
-        severity: 'error' as const,
-        message: 'Build failed',
-        hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`,
-      }];
+  const issues: ValidationIssue[] =
+    errors.length > 0
+      ? errors.map((e) => ({
+          type: 'file' as const,
+          severity: 'error' as const,
+          message: `Build error: ${e}`,
+          hint: 'Fix the error and run build again',
+        }))
+      : [
+          {
+            type: 'file' as const,
+            severity: 'error' as const,
+            message: 'Build failed',
+            hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`,
+          },
+        ];
 
   return {
     passed: false,
diff --git a/src/lib/validation/validator.ts b/src/lib/validation/validator.ts
index b35a2fa..6acaa43 100644
--- a/src/lib/validation/validator.ts
+++ b/src/lib/validation/validator.ts
@@ -30,12 +30,12 @@ export async function validateInstallation(
   }
 
   // Run validations
-  issues.push(...await validatePackages(rules, projectDir));
-  issues.push(...await validateEnvVars(rules, projectDir));
-  issues.push(...await validateFiles(rules, projectDir));
+  issues.push(...(await validatePackages(rules, projectDir)));
+  issues.push(...(await validateEnvVars(rules, projectDir)));
+  issues.push(...(await validateFiles(rules, projectDir)));
 
   // Run framework-specific cross-validations
-  issues.push(...await validateFrameworkSpecific(framework, projectDir));
+  issues.push(...(await validateFrameworkSpecific(framework, projectDir)));
 
   // Run build validation if enabled
   if (options.runBuild !== false) {
@@ -220,10 +220,7 @@ export async function validateFiles(rules: ValidationRules, projectDir: string):
 /**
  * Framework-specific cross-validations that require reading multiple sources.
  */
-export async function validateFrameworkSpecific(
-  framework: string,
-  projectDir: string,
-): Promise<ValidationIssue[]> {
+export async function validateFrameworkSpecific(framework: string, projectDir: string): Promise<ValidationIssue[]> {
   const issues: ValidationIssue[] = [];
 
   // Universal cross-validations
diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts
index 2ca23bc..2316a18 100644
--- a/tests/evals/__tests__/agent-executor.spec.ts
+++ b/tests/evals/__tests__/agent-executor.spec.ts
@@ -13,7 +13,11 @@ const { mockRunAgent, mockConfig, mockCredentials } = vi.hoisted(() => ({
     proxy: { refreshThresholdMs: 300000 },
     nodeVersion: '20',
     logging: { debugMode: false },
-    documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' },
+    documentation: {
+      workosDocsUrl: 'https://workos.com/docs',
+      dashboardUrl: 'https://dashboard.workos.com',
+      issuesUrl: 'https://github.com',
+    },
     frameworks: {},
     legacy: { oauthPort: 3000 },
     branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false },
diff --git a/tests/evals/reporter.ts b/tests/evals/reporter.ts
index 297641e..316dd7e 100644
--- a/tests/evals/reporter.ts
+++ b/tests/evals/reporter.ts
@@ -60,7 +60,9 @@ export function printMatrix(results: EvalResult[]): void {
   const total = results.length;
   const rate = ((passed / total) * 100).toFixed(1);
   const selfCorrected = results.filter((r) => r.selfCorrected).length;
-  console.log(`\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? `, ${selfCorrected} self-corrected` : ''}`);
+  console.log(
+    `\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? `, ${selfCorrected} self-corrected` : ''}`,
+  );
 
   if (passed < total) {
     console.log('\nFailed scenarios:');
diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts
index a2c2f63..6805ed0 100644
--- a/tests/evals/success-criteria.ts
+++ b/tests/evals/success-criteria.ts
@@ -70,7 +70,11 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria
   return {
     passed: failures.length === 0,
     criteria,
-    actual: { firstAttemptPassRate: firstAttemptRate, withCorrectionPassRate: withCorrectionRate, withRetryPassRate: withRetryRate },
+    actual: {
+      firstAttemptPassRate: firstAttemptRate,
+      withCorrectionPassRate: withCorrectionRate,
+      withRetryPassRate: withRetryRate,
+    },
     failures,
   };
 }

From 719fd6b8669d3eec5ab6d8c768bfd94ea3c067d4 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 12:06:52 -0600
Subject: [PATCH 12/14] chore: remove comment slop and dead validation:quick
 events

---
 src/lib/agent-interface.ts            | 16 ++--------------
 src/lib/events.ts                     |  7 -------
 src/lib/validation/build-validator.ts |  9 +--------
 src/lib/validation/quick-checks.ts    |  5 -----
 4 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 65b168f..6b1b3b5 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -536,7 +536,6 @@ export async function runAgent(
     resetTurnSignal();
 
     const createPromptStream = async function* () {
-      // Initial prompt
       yield {
         type: 'user',
         session_id: '',
@@ -544,13 +543,10 @@ export async function runAgent(
         parent_tool_use_id: null,
       };
 
-      // Retry loop — yield follow-up correction prompts on validation failure
       if (retryConfig && maxRetries > 0) {
         while (retryCount < maxRetries) {
-          // Wait for agent to finish current turn
           await currentTurnDone;
 
-          // Run validation between turns
           emitter?.emit('validation:retry:start', { attempt: retryCount + 1 });
 
           let validationPrompt: string | null;
@@ -567,14 +563,13 @@ export async function runAgent(
             passed: validationPrompt === null,
           });
 
-          if (validationPrompt === null) break; // Validation passed
+          if (validationPrompt === null) break;
 
           retryCount++;
           emitter?.emit('agent:retry', { attempt: retryCount, maxRetries });
 
           resetTurnSignal();
 
-          // Feed errors back to agent in same conversation
           yield {
             type: 'user',
             session_id: '',
@@ -584,7 +579,6 @@ export async function runAgent(
         }
       }
 
-      // Keep generator alive until the final result is received
       await currentTurnDone;
     };
 
@@ -629,16 +623,10 @@ export async function runAgent(
       if (messageError) {
         sdkError = messageError;
       }
-      // Signal turn completion when result received — this resumes the generator
       if (message.type === 'result') {
         resolveCurrentTurn();
       }
-      // Let callers observe messages (e.g., for latency tracking in evals)
-      try {
-        onMessage?.(message);
-      } catch {
-        /* observer errors are non-critical */
-      }
+      try { onMessage?.(message); } catch { /* non-critical */ }
     }
 
     const durationMs = Date.now() - startTime;
diff --git a/src/lib/events.ts b/src/lib/events.ts
index e0a2279..027bd31 100644
--- a/src/lib/events.ts
+++ b/src/lib/events.ts
@@ -57,13 +57,6 @@ export interface InstallerEvents {
   'validation:retry:start': { attempt: number };
   'validation:retry:complete': { attempt: number; passed: boolean };
 
-  'validation:quick:start': Record<string, never>;
-  'validation:quick:complete': {
-    passed: boolean;
-    results: import('./validation/types.js').QuickCheckResult[];
-    durationMs: number;
-  };
-
   'validation:start': { framework: string };
   'validation:issues': { issues: import('./validation/types.js').ValidationIssue[] };
   'validation:complete': { passed: boolean; issueCount: number; durationMs: number };
diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts
index 52836c5..854f96d 100644
--- a/src/lib/validation/build-validator.ts
+++ b/src/lib/validation/build-validator.ts
@@ -125,41 +125,34 @@ export interface BuildCommand {
  * Returns null if no build system detected — caller should skip build validation.
  */
 export async function detectBuildCommand(projectDir: string): Promise<BuildCommand | null> {
-  // 1. package.json with build script (JS/TS frameworks)
   const pm = detectPackageManager(projectDir);
   if (await hasBuildScriptInPackageJson(projectDir)) {
     const args = pm === 'npm' ? ['run', 'build'] : ['build'];
     return { command: pm, args };
   }
 
-  // 2. Go (go.mod → go build ./...)
   if (existsSync(join(projectDir, 'go.mod'))) {
     return { command: 'go', args: ['build', './...'] };
   }
 
-  // 3. Elixir (mix.exs → mix compile)
   if (existsSync(join(projectDir, 'mix.exs'))) {
     return { command: 'mix', args: ['compile'] };
   }
 
-  // 4. .NET (*.csproj → dotnet build)
   try {
     const files = readdirSync(projectDir);
     if (files.some((f) => f.endsWith('.csproj'))) {
       return { command: 'dotnet', args: ['build'] };
     }
   } catch {
-    // Can't read directory — skip
+    // Can't read directory
   }
 
-  // 5. Kotlin/Java (build.gradle.kts or build.gradle → gradlew/gradle build)
   if (existsSync(join(projectDir, 'build.gradle.kts')) || existsSync(join(projectDir, 'build.gradle'))) {
     const gradlew = existsSync(join(projectDir, 'gradlew')) ? './gradlew' : 'gradle';
     return { command: gradlew, args: ['build'] };
   }
 
-  // Interpreted languages (Python, Ruby, PHP) have no universal build command.
-  // Return null — quick-checks will skip the build step silently.
   return null;
 }
 
diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts
index 3612209..7ceab1b 100644
--- a/src/lib/validation/quick-checks.ts
+++ b/src/lib/validation/quick-checks.ts
@@ -109,8 +109,6 @@ export async function runTypecheckValidation(
 
 /**
  * Run build as a quick check using auto-detected build command.
- * Supports JS (package.json), Go (go.mod), Elixir (mix.exs), .NET (*.csproj), Kotlin/Java (build.gradle).
- * Returns passed when no build system detected — quick-checks are an optimization, not a requirement.
  */
 async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise<QuickCheckResult> {
   const startTime = Date.now();
@@ -173,7 +171,6 @@ interface TypecheckCommand {
 
 /**
  * Detect the appropriate typecheck command for the project.
- * Checks for tsc in node_modules, then framework-specific alternatives.
  */
 async function detectTypecheckCommand(projectDir: string): Promise<TypecheckCommand | null> {
   const pm = detectPackageManager(projectDir);
@@ -234,7 +231,6 @@ function parseTypecheckErrors(output: string): string[] {
 
 /**
  * Format typecheck errors into an agent-ready prompt.
- * Turns "TS2345: Argument of type..." into actionable instructions.
  */
 function formatTypecheckErrors(errors: string[], rawOutput: string): string {
   if (errors.length === 0) {
@@ -267,7 +263,6 @@ function formatBuildErrors(issues: ValidationIssue[]): string {
 
 /**
  * Format quick check failures into an agent-ready prompt.
- * Combines typecheck and build errors into a single actionable prompt.
  */
 function formatForAgent(results: QuickCheckResult[]): string {
   const failedResults = results.filter((r) => !r.passed);

From 7cd7147b8d095760fed5fda79e1cb805ab7836de Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 15:31:57 -0600
Subject: [PATCH 13/14] refactor: simplify quick-checks, extract shared
 validateAndFormat, remove dead code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract passResult helper (4 identical object literals → 1 function),
unify parseTypecheckErrors into single regex with Set dedup, extract
quickCheckValidateAndFormat shared between agent-runner and eval
executor, remove getIntegration indirection and dead continueUrl param.
---
 src/lib/agent-interface.ts                   |   5 +-
 src/lib/agent-runner.ts                      |  49 ++-----
 src/lib/validation/index.ts                  |   2 +-
 src/lib/validation/quick-checks.ts           | 142 +++++--------------
 tests/evals/__tests__/agent-executor.spec.ts |   2 +-
 tests/evals/agent-executor.ts                |  52 ++-----
 tests/evals/parallel-runner.ts               |   6 +-
 7 files changed, 61 insertions(+), 197 deletions(-)

diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 6b1b3b5..dbe0c99 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -519,12 +519,11 @@ export async function runAgent(
   const collectedText: string[] = [];
 
   try {
-    // Retry loop coordination
     let retryCount = 0;
     const maxRetries = retryConfig?.maxRetries ?? 0;
 
-    // Turn completion signals — the response loop resolves currentTurnDone
-    // when a 'result' message arrives. The generator awaits it between turns.
+    // Turn completion signals — resolveCurrentTurn is called when a 'result'
+    // message arrives; the prompt generator awaits currentTurnDone between turns.
     let resolveCurrentTurn!: () => void;
     let currentTurnDone!: Promise<void>;
 
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index a84aef8..b554d73 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -1,5 +1,5 @@
 import { SPINNER_MESSAGE, type FrameworkConfig } from './framework-config.js';
-import { validateInstallation, runQuickChecks } from './validation/index.js';
+import { validateInstallation, quickCheckValidateAndFormat } from './validation/index.js';
 import type { InstallerOptions } from '../utils/types.js';
 import {
   ensurePackageIsInstalled,
@@ -113,18 +113,11 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
     options,
   );
 
-  // Build validation callback for retry loop — uses quick checks from Phase 1
-  const validateAndFormat = async (workingDirectory: string): Promise<string | null> => {
-    const quickResult = await runQuickChecks(workingDirectory);
-    return quickResult.passed ? null : quickResult.agentRetryPrompt;
-  };
-
-  // Build retry config
   const retryConfig: RetryConfig | undefined = options.noValidate
     ? undefined
     : {
         maxRetries: options.maxRetries ?? 2,
-        validateAndFormat,
+        validateAndFormat: quickCheckValidateAndFormat,
       };
 
   // Run agent with retry support — agent gets correction prompts on validation failure
@@ -190,12 +183,6 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
     });
   }
 
-  // Skip MCP server setup for now (WorkOS doesn't need it initially)
-  // await addMCPServerToClientsStep({ ... });
-
-  // Build outro message
-  const continueUrl = undefined; // No signup flow for WorkOS wizard
-
   const changes = [
     ...config.ui.getOutroChanges(frameworkContext),
     Object.keys(envVars).length > 0 ? `Added environment variables to .env file` : '',
@@ -209,8 +196,7 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
       : '',
   ].filter(Boolean);
 
-  // Build detailed summary to return to caller (state machine)
-  const summary = buildCompletionSummary(config, changes, nextSteps, continueUrl);
+  const summary = buildCompletionSummary(config, changes, nextSteps);
 
   await analytics.shutdown('success');
 
@@ -277,41 +263,24 @@ Report your progress using [STATUS] prefixes.
 Begin by invoking the ${skillName} skill.`;
 }
 
-/**
- * Build a completion summary for the event payload.
- * This is a plain-text summary without styling (adapters handle presentation).
- */
-function buildCompletionSummary(
-  config: FrameworkConfig,
-  changes: string[],
-  nextSteps: string[],
-  continueUrl: string | undefined,
-): string {
-  const lines: string[] = [];
-
-  lines.push('Successfully installed WorkOS AuthKit!');
-  lines.push('');
+function buildCompletionSummary(config: FrameworkConfig, changes: string[], nextSteps: string[]): string {
+  const lines: string[] = ['Successfully installed WorkOS AuthKit!', ''];
 
   if (changes.length > 0) {
     lines.push('What the agent did:');
-    changes.forEach((change) => lines.push(`• ${change}`));
+    for (const change of changes) lines.push(`• ${change}`);
     lines.push('');
   }
 
   if (nextSteps.length > 0) {
     lines.push('Next steps:');
-    nextSteps.forEach((step) => lines.push(`• ${step}`));
+    for (const step of nextSteps) lines.push(`• ${step}`);
     lines.push('');
   }
 
-  lines.push(`Learn more: ${config.metadata.docsUrl}`);
-
-  if (continueUrl) {
-    lines.push(`Continue onboarding: ${continueUrl}`);
-  }
-
-  lines.push('');
   lines.push(
+    `Learn more: ${config.metadata.docsUrl}`,
+    '',
     'Note: This installer uses an LLM agent to analyze and modify your project. Please review the changes made.',
   );
 
diff --git a/src/lib/validation/index.ts b/src/lib/validation/index.ts
index c450b97..26c6316 100644
--- a/src/lib/validation/index.ts
+++ b/src/lib/validation/index.ts
@@ -7,7 +7,7 @@ export {
   type ValidateOptions,
 } from './validator.js';
 export { runBuildValidation, type BuildResult } from './build-validator.js';
-export { runQuickChecks, runTypecheckValidation } from './quick-checks.js';
+export { runQuickChecks, runTypecheckValidation, quickCheckValidateAndFormat } from './quick-checks.js';
 export type {
   ValidationResult,
   ValidationRules,
diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts
index 7ceab1b..2a3ec92 100644
--- a/src/lib/validation/quick-checks.ts
+++ b/src/lib/validation/quick-checks.ts
@@ -18,14 +18,11 @@ export async function runQuickChecks(
   const startTime = Date.now();
   const results: QuickCheckResult[] = [];
 
-  // Step 1: Typecheck
   const typecheckResult = await runTypecheckValidation(projectDir, options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS);
   results.push(typecheckResult);
 
-  // Step 2: Build — only if typecheck passed and build not skipped
   if (typecheckResult.passed && !options?.skipBuild) {
-    const buildResult = await runBuildQuickCheck(projectDir, options?.timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS);
-    results.push(buildResult);
+    results.push(await runBuildQuickCheck(projectDir, options?.timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS));
   }
 
   const passed = results.every((r) => r.passed);
@@ -38,6 +35,10 @@ export async function runQuickChecks(
   };
 }
 
+function passResult(phase: QuickCheckResult['phase'], startTime: number): QuickCheckResult {
+  return { passed: true, phase, issues: [], agentPrompt: null, durationMs: Date.now() - startTime };
+}
+
 /**
  * Run typecheck only (tsc --noEmit or framework equivalent).
  * Faster than full build — catches type errors in ~5s.
@@ -50,14 +51,7 @@ export async function runTypecheckValidation(
   const typecheckCmd = await detectTypecheckCommand(projectDir);
 
   if (!typecheckCmd) {
-    // No typecheck available — pass through
-    return {
-      passed: true,
-      phase: 'typecheck',
-      issues: [],
-      agentPrompt: null,
-      durationMs: Date.now() - startTime,
-    };
+    return passResult('typecheck', startTime);
   }
 
   const { exitCode, stdout, stderr } = await spawnCommand(
@@ -68,25 +62,18 @@ export async function runTypecheckValidation(
   );
 
   if (exitCode === 0) {
-    return {
-      passed: true,
-      phase: 'typecheck',
-      issues: [],
-      agentPrompt: null,
-      durationMs: Date.now() - startTime,
-    };
+    return passResult('typecheck', startTime);
   }
 
   const output = stdout + stderr;
   const errors = parseTypecheckErrors(output);
   const issues: ValidationIssue[] = errors.map((error) => ({
-    type: 'file' as const,
-    severity: 'error' as const,
+    type: 'file',
+    severity: 'error',
     message: `Type error: ${error}`,
     hint: 'Fix the type error and run typecheck again',
   }));
 
-  // Fallback if no specific errors parsed
   if (issues.length === 0) {
     issues.push({
       type: 'file',
@@ -96,44 +83,27 @@ export async function runTypecheckValidation(
     });
   }
 
-  const agentPrompt = formatTypecheckErrors(errors, output);
-
   return {
     passed: false,
     phase: 'typecheck',
     issues,
-    agentPrompt,
+    agentPrompt: formatTypecheckErrors(errors, output),
     durationMs: Date.now() - startTime,
   };
 }
 
-/**
- * Run build as a quick check using auto-detected build command.
- */
 async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise<QuickCheckResult> {
   const startTime = Date.now();
   const buildCmd = await detectBuildCommand(projectDir);
 
   if (!buildCmd) {
-    return {
-      passed: true,
-      phase: 'build',
-      issues: [],
-      agentPrompt: null,
-      durationMs: Date.now() - startTime,
-    };
+    return passResult('build', startTime);
   }
 
   const { exitCode, stdout, stderr } = await spawnCommand(buildCmd.command, buildCmd.args, projectDir, timeoutMs);
 
   if (exitCode === 0) {
-    return {
-      passed: true,
-      phase: 'build',
-      issues: [],
-      agentPrompt: null,
-      durationMs: Date.now() - startTime,
-    };
+    return passResult('build', startTime);
   }
 
   const output = stdout + stderr;
@@ -141,15 +111,15 @@ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promis
   const issues: ValidationIssue[] =
     errors.length > 0
       ? errors.map((e) => ({
-          type: 'file' as const,
-          severity: 'error' as const,
+          type: 'file',
+          severity: 'error',
           message: `Build error: ${e}`,
           hint: 'Fix the error and run build again',
         }))
       : [
           {
-            type: 'file' as const,
-            severity: 'error' as const,
+            type: 'file',
+            severity: 'error',
             message: 'Build failed',
             hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`,
           },
@@ -169,69 +139,39 @@ interface TypecheckCommand {
   args: string[];
 }
 
-/**
- * Detect the appropriate typecheck command for the project.
- */
 async function detectTypecheckCommand(projectDir: string): Promise<TypecheckCommand | null> {
   const pm = detectPackageManager(projectDir);
 
-  // Check for typecheck script in package.json first
   try {
     const content = await readFile(join(projectDir, 'package.json'), 'utf-8');
     const pkg = JSON.parse(content) as { scripts?: Record<string, string> };
 
-    if (pkg.scripts?.typecheck) {
-      const args = pm === 'npm' ? ['run', 'typecheck'] : ['typecheck'];
-      return { command: pm, args };
-    }
-
-    if (pkg.scripts?.['type-check']) {
-      const args = pm === 'npm' ? ['run', 'type-check'] : ['type-check'];
+    const scriptName = pkg.scripts?.typecheck ? 'typecheck' : pkg.scripts?.['type-check'] ? 'type-check' : null;
+    if (scriptName) {
+      const args = pm === 'npm' ? ['run', scriptName] : [scriptName];
       return { command: pm, args };
     }
   } catch {
-    // No package.json or malformed — continue detection
+    // No package.json or malformed
   }
 
-  // Only fall back to tsc if the project actually uses TypeScript
   try {
     await readFile(join(projectDir, 'tsconfig.json'), 'utf-8');
     return { command: 'npx', args: ['tsc', '--noEmit'] };
   } catch {
-    // No tsconfig.json — not a TypeScript project, skip typecheck
     return null;
   }
 }
 
-/**
- * Parse TypeScript-specific errors from typecheck output.
- */
 function parseTypecheckErrors(output: string): string[] {
-  const errors: string[] = [];
-
-  // TypeScript errors: "src/file.ts(line,col): error TS2345: ..."
-  const tsErrors = output.match(/[\w./]+\.\w+\(\d+,\d+\):\s*error\s+TS\d+:.+/g);
-  if (tsErrors) {
-    errors.push(...tsErrors.slice(0, 10));
-  }
-
-  // Also match "src/file.ts:line:col - error TS2345: ..." (tsc --pretty format)
-  const prettyErrors = output.match(/[\w./]+\.\w+:\d+:\d+\s*-\s*error\s+TS\d+:.+/g);
-  if (prettyErrors) {
-    // Dedupe with existing errors
-    for (const err of prettyErrors.slice(0, 10)) {
-      if (!errors.some((e) => e.includes(err.split(':')[0]))) {
-        errors.push(err);
-      }
-    }
-  }
-
-  return errors.slice(0, 10);
+  // Match both TS error formats:
+  //   src/file.ts(line,col): error TS2345: ...
+  //   src/file.ts:line:col - error TS2345: ...  (tsc --pretty)
+  const pattern = /[\w./]+\.\w+(?:\(\d+,\d+\):\s*|:\d+:\d+\s*-\s*)error\s+TS\d+:.+/g;
+  const matches = output.match(pattern);
+  return matches ? [...new Set(matches)].slice(0, 10) : [];
 }
 
-/**
- * Format typecheck errors into an agent-ready prompt.
- */
 function formatTypecheckErrors(errors: string[], rawOutput: string): string {
   if (errors.length === 0) {
     // Couldn't parse specific errors — give raw output
@@ -253,35 +193,27 @@ function formatTypecheckErrors(errors: string[], rawOutput: string): string {
   return `The typecheck failed with ${errors.length} error${errors.length === 1 ? '' : 's'}:\n\n${lines.join('\n')}\n\nFix these type errors in the indicated files.`;
 }
 
-/**
- * Format build errors into an agent-ready prompt.
- */
 function formatBuildErrors(issues: ValidationIssue[]): string {
   const errorMessages = issues.map((i) => `- ${i.message}`);
   return `The build failed:\n\n${errorMessages.join('\n')}\n\nFix these build errors.`;
 }
 
-/**
- * Format quick check failures into an agent-ready prompt.
- */
 function formatForAgent(results: QuickCheckResult[]): string {
-  const failedResults = results.filter((r) => !r.passed);
-  if (failedResults.length === 0) return '';
-
-  const parts: string[] = [];
-
-  for (const result of failedResults) {
-    if (result.agentPrompt) {
-      parts.push(result.agentPrompt);
-    }
-  }
-
-  return parts.join('\n\n');
+  return results
+    .filter((r) => !r.passed && r.agentPrompt)
+    .map((r) => r.agentPrompt!)
+    .join('\n\n');
 }
 
 /**
- * Spawn a command and collect output.
+ * Validation callback suitable for RetryConfig.validateAndFormat.
+ * Returns null if checks pass, or an agent-ready error prompt if they fail.
  */
+export async function quickCheckValidateAndFormat(workingDirectory: string): Promise<string | null> {
+  const result = await runQuickChecks(workingDirectory);
+  return result.passed ? null : result.agentRetryPrompt;
+}
+
 function spawnCommand(
   command: string,
   args: string[],
diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts
index 2316a18..62f057d 100644
--- a/tests/evals/__tests__/agent-executor.spec.ts
+++ b/tests/evals/__tests__/agent-executor.spec.ts
@@ -52,7 +52,7 @@ vi.mock('../../../src/lib/settings.js', () => ({
 }));
 
 vi.mock('../../../src/lib/validation/quick-checks.js', () => ({
-  runQuickChecks: vi.fn(),
+  quickCheckValidateAndFormat: vi.fn(),
 }));
 
 // Mock debug/analytics that agent-interface transitively imports
diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts
index 73c7e98..0b244d2 100644
--- a/tests/evals/agent-executor.ts
+++ b/tests/evals/agent-executor.ts
@@ -5,7 +5,7 @@ import { writeEnvLocal } from '../../src/lib/env-writer.js';
 import { parseEnvFile } from '../../src/utils/env-parser.js';
 import { getConfig } from '../../src/lib/settings.js';
 import { LatencyTracker } from './latency-tracker.js';
-import { runQuickChecks } from '../../src/lib/validation/quick-checks.js';
+import { quickCheckValidateAndFormat } from '../../src/lib/validation/quick-checks.js';
 import { runAgent, type AgentRunConfig, type RetryConfig } from '../../src/lib/agent-interface.js';
 import type { InstallerOptions } from '../../src/utils/types.js';
 import type { ToolCall, LatencyMetrics } from './types.js';
@@ -91,19 +91,16 @@ export class AgentExecutor {
 
   async run(retryConfig?: AgentRetryConfig): Promise<AgentResult> {
     const config = retryConfig ?? { enabled: true, maxRetries: 2 };
-    const integration = this.getIntegration();
     const toolCalls: ToolCall[] = [];
     const collectedOutput: string[] = [];
 
     const label = this.options.scenarioName ? `[${this.options.scenarioName}]` : '';
     if (this.options.verbose) {
-      console.log(`${label} Initializing agent for ${integration}...`);
+      console.log(`${label} Initializing agent for ${this.framework}...`);
     }
 
-    // Start latency tracking
     this.latencyTracker.start();
 
-    // Write credentials to appropriate env file based on framework
     const envVars = {
       WORKOS_API_KEY: this.credentials.workosApiKey,
       WORKOS_CLIENT_ID: this.credentials.workosClientId,
@@ -115,21 +112,18 @@ export class AgentExecutor {
       writeEnvFile(this.workDir, envVars);
     }
 
-    // Build prompt
-    const skillName = SKILL_NAMES[integration];
+    const skillName = SKILL_NAMES[this.framework];
     const prompt = this.buildPrompt(skillName);
 
-    // Build SDK environment for direct mode
     const sdkEnv: Record<string, string | undefined> = {
       ...process.env,
       ANTHROPIC_API_KEY: this.credentials.anthropicApiKey,
+      ANTHROPIC_BASE_URL: undefined,
+      ANTHROPIC_AUTH_TOKEN: undefined,
       CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true',
       CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true',
     };
-    delete sdkEnv.ANTHROPIC_BASE_URL;
-    delete sdkEnv.ANTHROPIC_AUTH_TOKEN;
 
-    // Construct AgentRunConfig directly (bypasses initializeAgent/gateway auth)
     const agentRunConfig: AgentRunConfig = {
       workingDirectory: this.workDir,
       mcpServers: {
@@ -143,7 +137,6 @@ export class AgentExecutor {
       sdkEnv,
     };
 
-    // Thin InstallerOptions — only what runAgent needs
     const installerOptions: InstallerOptions = {
       debug: this.options.verbose ?? false,
       forceInstall: false,
@@ -153,15 +146,8 @@ export class AgentExecutor {
       skipAuth: true,
     };
 
-    // Build production RetryConfig with validateAndFormat callback
     const prodRetryConfig: RetryConfig | undefined = config.enabled
-      ? {
-          maxRetries: config.maxRetries,
-          validateAndFormat: async (workingDirectory: string): Promise<string | null> => {
-            const quickResult = await runQuickChecks(workingDirectory);
-            return quickResult.passed ? null : quickResult.agentRetryPrompt;
-          },
-        }
+      ? { maxRetries: config.maxRetries, validateAndFormat: quickCheckValidateAndFormat }
       : undefined;
 
     try {
@@ -178,34 +164,19 @@ export class AgentExecutor {
 
       const latencyMetrics = this.latencyTracker.finish();
       const correctionAttempts = result.retryCount ?? 0;
+      const base = { output: collectedOutput.join('\n'), toolCalls, latencyMetrics, correctionAttempts };
 
       if (result.error) {
-        return {
-          success: false,
-          output: collectedOutput.join('\n'),
-          toolCalls,
-          latencyMetrics,
-          error: result.errorMessage ?? String(result.error),
-          correctionAttempts,
-          selfCorrected: false,
-        };
+        return { ...base, success: false, error: result.errorMessage ?? String(result.error), selfCorrected: false };
       }
 
-      return {
-        success: true,
-        output: collectedOutput.join('\n'),
-        toolCalls,
-        latencyMetrics,
-        correctionAttempts,
-        selfCorrected: correctionAttempts > 0,
-      };
+      return { ...base, success: true, selfCorrected: correctionAttempts > 0 };
     } catch (error) {
-      const latencyMetrics = this.latencyTracker.finish();
       return {
         success: false,
         output: collectedOutput.join('\n'),
         toolCalls,
-        latencyMetrics,
+        latencyMetrics: this.latencyTracker.finish(),
         error: error instanceof Error ? error.message : String(error),
         correctionAttempts: 0,
         selfCorrected: false,
@@ -273,7 +244,4 @@ Begin by invoking the ${skillName} skill.`;
     }
   }
 
-  private getIntegration(): string {
-    return this.framework;
-  }
 }
diff --git a/tests/evals/parallel-runner.ts b/tests/evals/parallel-runner.ts
index 2383db7..0af0074 100644
--- a/tests/evals/parallel-runner.ts
+++ b/tests/evals/parallel-runner.ts
@@ -186,11 +186,7 @@ export class ParallelRunner {
 
     if (lastResult && !lastResult.passed) {
       console.log(`✗ ${scenarioName} FAILED`);
-      if (!this.options.verbose) {
-        this.printFailureDetails(lastResult, false);
-      } else {
-        this.printFailureDetails(lastResult, true);
-      }
+      this.printFailureDetails(lastResult, !!this.options.verbose);
       evalEvents.emitScenarioFail({
         scenario: scenarioName,
         framework: scenario.framework,

From 57047fc458311497749af3568622d0066f35ca08 Mon Sep 17 00:00:00 2001
From: Nick Nisi <nick.nisi@workos.com>
Date: Sat, 14 Feb 2026 15:34:47 -0600
Subject: [PATCH 14/14] chore: formatting

---
 src/lib/agent-interface.ts    | 6 +++++-
 tests/evals/agent-executor.ts | 1 -
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index dbe0c99..50be6ff 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -625,7 +625,11 @@ export async function runAgent(
       if (message.type === 'result') {
         resolveCurrentTurn();
       }
-      try { onMessage?.(message); } catch { /* non-critical */ }
+      try {
+        onMessage?.(message);
+      } catch {
+        /* non-critical */
+      }
     }
 
     const durationMs = Date.now() - startTime;
diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts
index 0b244d2..46c2a72 100644
--- a/tests/evals/agent-executor.ts
+++ b/tests/evals/agent-executor.ts
@@ -243,5 +243,4 @@ Begin by invoking the ${skillName} skill.`;
       }
     }
   }
-
 }