From a6283371745e79a0321d974a67aa7071f983a863 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 07:30:02 -0600 Subject: [PATCH 01/14] feat: add quick-checks validation for fast typecheck/build feedback Restructure validation into composable steps so typecheck (~5s) runs independently before full validation. Quick checks short-circuit on typecheck failure and format errors as actionable agent prompts, laying the foundation for the agent retry loop. --- src/lib/agent-runner.ts | 16 +- src/lib/events.ts | 7 + src/lib/validation/build-validator.ts | 6 +- src/lib/validation/index.ts | 12 +- src/lib/validation/quick-checks.spec.ts | 273 +++++++++++++++++++++++ src/lib/validation/quick-checks.ts | 274 ++++++++++++++++++++++++ src/lib/validation/types.ts | 17 ++ src/lib/validation/validator.ts | 39 ++-- 8 files changed, 625 insertions(+), 19 deletions(-) create mode 100644 src/lib/validation/quick-checks.spec.ts create mode 100644 src/lib/validation/quick-checks.ts diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index b6fcabf..79a453e 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -1,5 +1,5 @@ import { SPINNER_MESSAGE, type FrameworkConfig } from './framework-config.js'; -import { validateInstallation } from './validation/index.js'; +import { validateInstallation, runQuickChecks } from './validation/index.js'; import type { InstallerOptions } from '../utils/types.js'; import { ensurePackageIsInstalled, @@ -135,10 +135,22 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal // Run post-installation validation if (!options.noValidate) { + // Quick checks: fast typecheck + build before full validation + options.emitter?.emit('validation:quick:start', {}); + + const quickCheckResult = await runQuickChecks(options.installDir); + + options.emitter?.emit('validation:quick:complete', { + passed: quickCheckResult.passed, + results: quickCheckResult.results, + durationMs: 
quickCheckResult.totalDurationMs, + }); + + // Full validation — skip build since quick checks already ran it options.emitter?.emit('validation:start', { framework: config.metadata.integration }); const validationResult = await validateInstallation(config.metadata.integration, options.installDir, { - runBuild: true, + runBuild: false, }); if (validationResult.issues.length > 0) { diff --git a/src/lib/events.ts b/src/lib/events.ts index cec5cc9..91458cf 100644 --- a/src/lib/events.ts +++ b/src/lib/events.ts @@ -53,6 +53,13 @@ export interface InstallerEvents { 'agent:success': { summary?: string }; 'agent:failure': { message: string; stack?: string }; + 'validation:quick:start': Record; + 'validation:quick:complete': { + passed: boolean; + results: import('./validation/types.js').QuickCheckResult[]; + durationMs: number; + }; + 'validation:start': { framework: string }; 'validation:issues': { issues: import('./validation/types.js').ValidationIssue[] }; 'validation:complete': { passed: boolean; issueCount: number; durationMs: number }; diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts index 8debd15..2e5fc9f 100644 --- a/src/lib/validation/build-validator.ts +++ b/src/lib/validation/build-validator.ts @@ -99,13 +99,13 @@ export async function runBuildValidation(projectDir: string, timeoutMs: number = }); } -function detectPackageManager(projectDir: string): 'pnpm' | 'yarn' | 'npm' { +export function detectPackageManager(projectDir: string): 'pnpm' | 'yarn' | 'npm' { if (existsSync(join(projectDir, 'pnpm-lock.yaml'))) return 'pnpm'; if (existsSync(join(projectDir, 'yarn.lock'))) return 'yarn'; return 'npm'; } -async function hasBuildScriptInPackageJson(projectDir: string): Promise { +export async function hasBuildScriptInPackageJson(projectDir: string): Promise { try { const content = await readFile(join(projectDir, 'package.json'), 'utf-8'); const pkg = JSON.parse(content) as { scripts?: { build?: string } }; @@ -115,7 +115,7 
@@ async function hasBuildScriptInPackageJson(projectDir: string): Promise } } -function parseBuildErrors(output: string): string[] { +export function parseBuildErrors(output: string): string[] { const errors: string[] = []; // TypeScript errors: "file.ts(line,col): error TS..." diff --git a/src/lib/validation/index.ts b/src/lib/validation/index.ts index 49e74d2..c450b97 100644 --- a/src/lib/validation/index.ts +++ b/src/lib/validation/index.ts @@ -1,5 +1,13 @@ -export { validateInstallation, type ValidateOptions } from './validator.js'; +export { + validateInstallation, + validatePackages, + validateEnvVars, + validateFiles, + validateFrameworkSpecific, + type ValidateOptions, +} from './validator.js'; export { runBuildValidation, type BuildResult } from './build-validator.js'; +export { runQuickChecks, runTypecheckValidation } from './quick-checks.js'; export type { ValidationResult, ValidationRules, @@ -10,4 +18,6 @@ export type { EnvVarRule, FileRule, VariantRules, + QuickCheckResult, + QuickChecksOutput, } from './types.js'; diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts new file mode 100644 index 0000000..888c2b9 --- /dev/null +++ b/src/lib/validation/quick-checks.spec.ts @@ -0,0 +1,273 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { EventEmitter } from 'node:events'; + +// Mock child_process.spawn to avoid actually running tsc/build +vi.mock('child_process', () => ({ + spawn: vi.fn(), +})); + +import { spawn } from 'child_process'; +import { runQuickChecks, runTypecheckValidation } from './quick-checks.js'; + +const mockSpawn = vi.mocked(spawn); + +/** + * Creates a mock process lazily — must be used inside mockImplementationOnce, + * NOT mockReturnValueOnce, so the setTimeout fires after event listeners attach. 
+ */ +function createMockProcess(exitCode: number, stdout = '', stderr = '') { + const proc = new EventEmitter() as any; + proc.stdout = new EventEmitter(); + proc.stderr = new EventEmitter(); + + setTimeout(() => { + if (stdout) proc.stdout.emit('data', Buffer.from(stdout)); + if (stderr) proc.stderr.emit('data', Buffer.from(stderr)); + proc.emit('close', exitCode); + }, 10); + + return proc; +} + +describe('runQuickChecks', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'quick-checks-test-')); + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ + scripts: { typecheck: 'tsc --noEmit', build: 'next build' }, + }), + ); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + mockSpawn.mockReset(); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('returns passed=true when both typecheck and build succeed', async () => { + mockSpawn + .mockImplementationOnce(() => createMockProcess(0)) + .mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(2); + expect(result.results[0].phase).toBe('typecheck'); + expect(result.results[1].phase).toBe('build'); + expect(result.agentRetryPrompt).toBeNull(); + }); + + it('short-circuits build when typecheck fails', async () => { + const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable"; + + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(false); + expect(result.results).toHaveLength(1); + expect(result.results[0].phase).toBe('typecheck'); + expect(result.results[0].passed).toBe(false); + expect(mockSpawn).toHaveBeenCalledTimes(1); + }); + + it('runs build after typecheck passes', async () => { + mockSpawn + .mockImplementationOnce(() => 
createMockProcess(0)) + .mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(2); + expect(mockSpawn).toHaveBeenCalledTimes(2); + }); + + it('skips build when skipBuild option is true', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir, { skipBuild: true }); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(1); + expect(result.results[0].phase).toBe('typecheck'); + expect(mockSpawn).toHaveBeenCalledTimes(1); + }); + + it('generates agentRetryPrompt when typecheck fails', async () => { + const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runQuickChecks(testDir); + + expect(result.agentRetryPrompt).not.toBeNull(); + expect(result.agentRetryPrompt).toContain('typecheck failed'); + expect(result.agentRetryPrompt).toContain('src/middleware.ts'); + }); + + it('tracks total duration', async () => { + mockSpawn + .mockImplementationOnce(() => createMockProcess(0)) + .mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir); + + expect(typeof result.totalDurationMs).toBe('number'); + expect(result.totalDurationMs).toBeGreaterThanOrEqual(0); + }); + + it('reports build failure when typecheck passes but build fails', async () => { + mockSpawn + .mockImplementationOnce(() => createMockProcess(0)) // typecheck pass + .mockImplementationOnce(() => createMockProcess(1, '', 'Error: Build failed')); // build fail + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(false); + expect(result.results).toHaveLength(2); + expect(result.results[0].passed).toBe(true); + expect(result.results[1].passed).toBe(false); + 
expect(result.results[1].phase).toBe('build'); + expect(result.agentRetryPrompt).toContain('build failed'); + }); +}); + +describe('runTypecheckValidation', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'typecheck-test-')); + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ + scripts: { typecheck: 'tsc --noEmit' }, + }), + ); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + mockSpawn.mockReset(); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('returns passed=true when typecheck succeeds', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(true); + expect(result.phase).toBe('typecheck'); + expect(result.issues).toHaveLength(0); + expect(result.agentPrompt).toBeNull(); + }); + + it('parses TypeScript errors from output', async () => { + const tsError = + "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(false); + expect(result.issues.length).toBeGreaterThan(0); + expect(result.issues[0].message).toContain('Type error'); + expect(result.issues[0].severity).toBe('error'); + }); + + it('formats errors into actionable agent prompt', async () => { + const tsError = + "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runTypecheckValidation(testDir); + + expect(result.agentPrompt).not.toBeNull(); + expect(result.agentPrompt).toContain('src/middleware.ts'); + expect(result.agentPrompt).toContain('not assignable'); + }); + + it('handles pretty-printed 
tsc errors (colon-separated format)', async () => { + const tsError = + "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, tsError, '')); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(false); + expect(result.issues.length).toBeGreaterThan(0); + }); + + it('provides fallback message when errors cannot be parsed', async () => { + mockSpawn.mockImplementationOnce(() => + createMockProcess(1, '', 'Some unknown error format that we cannot parse'), + ); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(false); + expect(result.issues).toHaveLength(1); + expect(result.issues[0].message).toBe('Typecheck failed'); + }); + + it('uses typecheck script from package.json when available', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + await runTypecheckValidation(testDir); + + expect(mockSpawn).toHaveBeenCalledWith( + 'pnpm', + ['typecheck'], + expect.objectContaining({ cwd: testDir }), + ); + }); + + it('falls back to npx tsc --noEmit when no typecheck script', async () => { + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ scripts: { build: 'next build' } }), + ); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + await runTypecheckValidation(testDir); + + expect(mockSpawn).toHaveBeenCalledWith( + 'npx', + ['tsc', '--noEmit'], + expect.objectContaining({ cwd: testDir }), + ); + }); + + it('detects type-check script (hyphenated variant)', async () => { + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } }), + ); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + await runTypecheckValidation(testDir); + + expect(mockSpawn).toHaveBeenCalledWith( + 'pnpm', + ['type-check'], + expect.objectContaining({ cwd: testDir }), + ); + }); + + it('tracks duration', async 
() => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + const result = await runTypecheckValidation(testDir); + + expect(typeof result.durationMs).toBe('number'); + expect(result.durationMs).toBeGreaterThanOrEqual(0); + }); +}); diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts new file mode 100644 index 0000000..1a0b3a2 --- /dev/null +++ b/src/lib/validation/quick-checks.ts @@ -0,0 +1,274 @@ +import { spawn } from 'child_process'; +import { readFile } from 'fs/promises'; +import { join } from 'path'; +import type { QuickCheckResult, QuickChecksOutput, ValidationIssue } from './types.js'; +import { detectPackageManager, parseBuildErrors, runBuildValidation } from './build-validator.js'; + +const DEFAULT_TYPECHECK_TIMEOUT_MS = 30_000; +const DEFAULT_BUILD_TIMEOUT_MS = 60_000; + +/** + * Run fast deterministic checks: typecheck first, then build. + * Short-circuits: if typecheck fails, skip build (build will fail too). + */ +export async function runQuickChecks( + projectDir: string, + options?: { skipBuild?: boolean; timeoutMs?: number }, +): Promise { + const startTime = Date.now(); + const results: QuickCheckResult[] = []; + + // Step 1: Typecheck + const typecheckResult = await runTypecheckValidation( + projectDir, + options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS, + ); + results.push(typecheckResult); + + // Step 2: Build — only if typecheck passed and build not skipped + if (typecheckResult.passed && !options?.skipBuild) { + const buildResult = await runBuildQuickCheck(projectDir, options?.timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS); + results.push(buildResult); + } + + const passed = results.every((r) => r.passed); + + return { + passed, + results, + agentRetryPrompt: passed ? null : formatForAgent(results), + totalDurationMs: Date.now() - startTime, + }; +} + +/** + * Run typecheck only (tsc --noEmit or framework equivalent). + * Faster than full build — catches type errors in ~5s. 
+ */ +export async function runTypecheckValidation( + projectDir: string, + timeoutMs: number = DEFAULT_TYPECHECK_TIMEOUT_MS, +): Promise { + const startTime = Date.now(); + const typecheckCmd = await detectTypecheckCommand(projectDir); + + if (!typecheckCmd) { + // No typecheck available — pass through + return { + passed: true, + phase: 'typecheck', + issues: [], + agentPrompt: null, + durationMs: Date.now() - startTime, + }; + } + + const { exitCode, stdout, stderr } = await spawnCommand( + typecheckCmd.command, + typecheckCmd.args, + projectDir, + timeoutMs, + ); + + if (exitCode === 0) { + return { + passed: true, + phase: 'typecheck', + issues: [], + agentPrompt: null, + durationMs: Date.now() - startTime, + }; + } + + const output = stdout + stderr; + const errors = parseTypecheckErrors(output); + const issues: ValidationIssue[] = errors.map((error) => ({ + type: 'file' as const, + severity: 'error' as const, + message: `Type error: ${error}`, + hint: 'Fix the type error and run typecheck again', + })); + + // Fallback if no specific errors parsed + if (issues.length === 0) { + issues.push({ + type: 'file', + severity: 'error', + message: 'Typecheck failed', + hint: `Run \`${typecheckCmd.command} ${typecheckCmd.args.join(' ')}\` to see full output`, + }); + } + + const agentPrompt = formatTypecheckErrors(errors, output); + + return { + passed: false, + phase: 'typecheck', + issues, + agentPrompt, + durationMs: Date.now() - startTime, + }; +} + +/** + * Run build as a quick check, wrapping the existing runBuildValidation. + */ +async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise { + const buildResult = await runBuildValidation(projectDir, timeoutMs); + + return { + passed: buildResult.success, + phase: 'build', + issues: buildResult.issues, + agentPrompt: buildResult.success ? 
null : formatBuildErrors(buildResult.issues), + durationMs: buildResult.durationMs, + }; +} + +interface TypecheckCommand { + command: string; + args: string[]; +} + +/** + * Detect the appropriate typecheck command for the project. + * Checks for tsc in node_modules, then framework-specific alternatives. + */ +async function detectTypecheckCommand(projectDir: string): Promise { + const pm = detectPackageManager(projectDir); + + // Check for typecheck script in package.json first + try { + const content = await readFile(join(projectDir, 'package.json'), 'utf-8'); + const pkg = JSON.parse(content) as { scripts?: Record }; + + if (pkg.scripts?.typecheck) { + const args = pm === 'npm' ? ['run', 'typecheck'] : ['typecheck']; + return { command: pm, args }; + } + + if (pkg.scripts?.['type-check']) { + const args = pm === 'npm' ? ['run', 'type-check'] : ['type-check']; + return { command: pm, args }; + } + } catch { + // No package.json or malformed — continue detection + } + + // Fallback: use npx tsc --noEmit + return { command: 'npx', args: ['tsc', '--noEmit'] }; +} + +/** + * Parse TypeScript-specific errors from typecheck output. + */ +function parseTypecheckErrors(output: string): string[] { + const errors: string[] = []; + + // TypeScript errors: "src/file.ts(line,col): error TS2345: ..." + const tsErrors = output.match(/[\w./]+\.\w+\(\d+,\d+\):\s*error\s+TS\d+:.+/g); + if (tsErrors) { + errors.push(...tsErrors.slice(0, 10)); + } + + // Also match "src/file.ts:line:col - error TS2345: ..." (tsc --pretty format) + const prettyErrors = output.match(/[\w./]+\.\w+:\d+:\d+\s*-\s*error\s+TS\d+:.+/g); + if (prettyErrors) { + // Dedupe with existing errors + for (const err of prettyErrors.slice(0, 10)) { + if (!errors.some((e) => e.includes(err.split(':')[0]))) { + errors.push(err); + } + } + } + + return errors.slice(0, 10); +} + +/** + * Format typecheck errors into an agent-ready prompt. + * Turns "TS2345: Argument of type..." into actionable instructions. 
+ */ +function formatTypecheckErrors(errors: string[], rawOutput: string): string { + if (errors.length === 0) { + // Couldn't parse specific errors — give raw output + const truncated = rawOutput.slice(0, 2000); + return `The typecheck failed. Here is the output:\n\n${truncated}\n\nFix the type errors shown above.`; + } + + const lines = errors.map((error) => { + // Extract file:line info and error description + const fileMatch = error.match(/([\w./]+\.\w+)[:(]\d+/); + const tsMatch = error.match(/error\s+(TS\d+):\s*(.+)/); + + if (fileMatch && tsMatch) { + return `- ${fileMatch[1]}: ${tsMatch[2]} (${tsMatch[1]})`; + } + return `- ${error}`; + }); + + return `The typecheck failed with ${errors.length} error${errors.length === 1 ? '' : 's'}:\n\n${lines.join('\n')}\n\nFix these type errors in the indicated files.`; +} + +/** + * Format build errors into an agent-ready prompt. + */ +function formatBuildErrors(issues: ValidationIssue[]): string { + const errorMessages = issues.map((i) => `- ${i.message}`); + return `The build failed:\n\n${errorMessages.join('\n')}\n\nFix these build errors.`; +} + +/** + * Format quick check failures into an agent-ready prompt. + * Combines typecheck and build errors into a single actionable prompt. + */ +function formatForAgent(results: QuickCheckResult[]): string { + const failedResults = results.filter((r) => !r.passed); + if (failedResults.length === 0) return ''; + + const parts: string[] = []; + + for (const result of failedResults) { + if (result.agentPrompt) { + parts.push(result.agentPrompt); + } + } + + return parts.join('\n\n'); +} + +/** + * Spawn a command and collect output. 
+ */ +function spawnCommand( + command: string, + args: string[], + cwd: string, + timeoutMs: number, +): Promise<{ exitCode: number; stdout: string; stderr: string }> { + return new Promise((resolve) => { + const proc = spawn(command, args, { + cwd, + shell: true, + timeout: timeoutMs, + }); + + let stdout = ''; + let stderr = ''; + + proc.stdout?.on('data', (data: Buffer) => { + stdout += data.toString(); + }); + proc.stderr?.on('data', (data: Buffer) => { + stderr += data.toString(); + }); + + proc.on('close', (code) => { + resolve({ exitCode: code ?? 1, stdout, stderr }); + }); + + proc.on('error', () => { + resolve({ exitCode: 1, stdout, stderr }); + }); + }); +} diff --git a/src/lib/validation/types.ts b/src/lib/validation/types.ts index e3675fb..25a5ea5 100644 --- a/src/lib/validation/types.ts +++ b/src/lib/validation/types.ts @@ -47,5 +47,22 @@ export interface ValidationRules { variants?: Record; } +export interface QuickCheckResult { + passed: boolean; + phase: 'typecheck' | 'build'; + issues: ValidationIssue[]; + /** Formatted for agent consumption — actionable, not just error messages */ + agentPrompt: string | null; + durationMs: number; +} + +export interface QuickChecksOutput { + passed: boolean; + results: QuickCheckResult[]; + /** Combined agent-ready prompt summarizing all failures */ + agentRetryPrompt: string | null; + totalDurationMs: number; +} + // Re-export BuildResult from build-validator export type { BuildResult } from './build-validator.js'; diff --git a/src/lib/validation/validator.ts b/src/lib/validation/validator.ts index 772adba..b35a2fa 100644 --- a/src/lib/validation/validator.ts +++ b/src/lib/validation/validator.ts @@ -30,12 +30,12 @@ export async function validateInstallation( } // Run validations - await validatePackages(rules, projectDir, issues); - await validateEnvVars(rules, projectDir, issues); - await validateFiles(rules, projectDir, issues); + issues.push(...await validatePackages(rules, projectDir)); + 
issues.push(...await validateEnvVars(rules, projectDir)); + issues.push(...await validateFiles(rules, projectDir)); // Run framework-specific cross-validations - await validateFrameworkSpecific(framework, projectDir, issues); + issues.push(...await validateFrameworkSpecific(framework, projectDir)); // Run build validation if enabled if (options.runBuild !== false) { @@ -74,16 +74,17 @@ async function loadRules(framework: string, variant?: string): Promise { +export async function validatePackages(rules: ValidationRules, projectDir: string): Promise { + const issues: ValidationIssue[] = []; const pkgPath = join(projectDir, 'package.json'); - if (!existsSync(pkgPath)) return; + if (!existsSync(pkgPath)) return issues; let pkg: Record; try { pkg = JSON.parse(await readFile(pkgPath, 'utf-8')); } catch { // Malformed package.json - skip package validation - return; + return issues; } const deps = (pkg.dependencies || {}) as Record; @@ -103,9 +104,12 @@ async function validatePackages(rules: ValidationRules, projectDir: string, issu }); } } + + return issues; } -async function validateEnvVars(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise { +export async function validateEnvVars(rules: ValidationRules, projectDir: string): Promise { + const issues: ValidationIssue[] = []; const envPath = join(projectDir, '.env.local'); let envContent = ''; @@ -120,7 +124,7 @@ async function validateEnvVars(rules: ValidationRules, projectDir: string, issue hint: 'Create .env.local with required environment variables', }); } - return; + return issues; } for (const rule of rules.envVars) { @@ -144,9 +148,13 @@ async function validateEnvVars(rules: ValidationRules, projectDir: string, issue }); } } + + return issues; } -async function validateFiles(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise { +export async function validateFiles(rules: ValidationRules, projectDir: string): Promise { + const issues: ValidationIssue[] = []; 
+ for (const rule of rules.files) { let matches: string[]; try { @@ -205,16 +213,19 @@ async function validateFiles(rules: ValidationRules, projectDir: string, issues: } } } + + return issues; } /** * Framework-specific cross-validations that require reading multiple sources. */ -async function validateFrameworkSpecific( +export async function validateFrameworkSpecific( framework: string, projectDir: string, - issues: ValidationIssue[], -): Promise { +): Promise { + const issues: ValidationIssue[] = []; + // Universal cross-validations await validateCredentialFormats(projectDir, issues); await validateDuplicateEnvVars(projectDir, issues); @@ -238,6 +249,8 @@ async function validateFrameworkSpecific( await validateCookiePasswordLength(projectDir, issues, 'WORKOS_COOKIE_PASSWORD'); break; } + + return issues; } /** From 8436ed413b852f90bb6fd2b5dcfe90dd83936e1c Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 07:43:59 -0600 Subject: [PATCH 02/14] feat: add retry loop for agent self-correction on validation failures Extend the async generator in agent-interface to yield follow-up correction prompts when quick-checks (typecheck/build) fail. The agent retains full conversation context and gets up to 2 chances to fix its own mistakes before results surface to the user. Configurable via maxRetries option (default 2, 0 to disable). 
--- src/lib/agent-interface.spec.ts | 306 ++++++++++++++++++++++++++++++++ src/lib/agent-interface.ts | 90 ++++++++-- src/lib/agent-runner.ts | 41 +++-- src/lib/events.ts | 4 + src/utils/types.ts | 7 + 5 files changed, 419 insertions(+), 29 deletions(-) create mode 100644 src/lib/agent-interface.spec.ts diff --git a/src/lib/agent-interface.spec.ts b/src/lib/agent-interface.spec.ts new file mode 100644 index 0000000..d627276 --- /dev/null +++ b/src/lib/agent-interface.spec.ts @@ -0,0 +1,306 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { EventEmitter } from 'node:events'; + +const { mockQuery, mockConfig } = vi.hoisted(() => ({ + mockQuery: vi.fn(), + mockConfig: { + model: 'test-model', + workos: { clientId: 'client_test', authkitDomain: 'test.workos.com', llmGatewayUrl: 'http://localhost:8000' }, + telemetry: { enabled: false, eventName: 'test_event' }, + proxy: { refreshThresholdMs: 300000 }, + nodeVersion: '20', + logging: { debugMode: false }, + documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' }, + frameworks: {}, + legacy: { oauthPort: 3000 }, + branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false }, + }, +})); + +vi.mock('@anthropic-ai/claude-agent-sdk', () => ({ + query: (...args: unknown[]) => mockQuery(...args), +})); + +vi.mock('../utils/debug.js', () => ({ + debug: vi.fn(), + logInfo: vi.fn(), + logWarn: vi.fn(), + logError: vi.fn(), + initLogFile: vi.fn(), + getLogFilePath: vi.fn(() => null), +})); + +vi.mock('../utils/analytics.js', () => ({ + analytics: { + capture: vi.fn(), + setTag: vi.fn(), + shutdown: vi.fn(), + llmRequest: vi.fn(), + incrementAgentIterations: vi.fn(), + toolCalled: vi.fn(), + }, +})); + +vi.mock('./settings.js', () => ({ + getConfig: vi.fn(() => mockConfig), + getAuthkitDomain: vi.fn(() => 'test.workos.com'), + getCliAuthClientId: vi.fn(() => 'client_test'), +})); + 
+vi.mock('./credentials.js', () => ({ + hasCredentials: vi.fn(() => false), + getCredentials: vi.fn(() => null), +})); + +vi.mock('./token-refresh.js', () => ({ + ensureValidToken: vi.fn(async () => ({ success: true })), +})); + +vi.mock('./credential-proxy.js', () => ({ + startCredentialProxy: vi.fn(), +})); + +vi.mock('../utils/urls.js', () => ({ + getLlmGatewayUrlFromHost: vi.fn(() => 'http://localhost:8000'), +})); + +import { runAgent, type RetryConfig } from './agent-interface.js'; +import { InstallerEventEmitter } from './events.js'; +import type { InstallerOptions } from '../utils/types.js'; + +/** + * Create a mock SDK response that consumes the prompt stream and yields + * responses for each prompt message. This models the real SDK behavior: + * the response generator stays alive as long as prompts keep coming. + */ +function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>) { + return function mockQueryImpl({ prompt }: { prompt: AsyncIterable; options: unknown }) { + let turnIndex = 0; + + async function* responseGenerator() { + // Consume each prompt message and respond with the corresponding turn + for await (const _promptMsg of prompt) { + if (turnIndex >= turns.length) continue; + + const turn = turns[turnIndex]; + turnIndex++; + + if (turn.text) { + yield { + type: 'assistant', + message: { + content: [{ type: 'text', text: turn.text }], + usage: { input_tokens: 100, output_tokens: 50 }, + model: 'test-model', + }, + }; + } + + yield { + type: 'result', + subtype: turn.error ? 'error' : 'success', + result: turn.text ?? '', + ...(turn.error ? 
{ errors: ['Test error'] } : {}), + }; + } + } + + return responseGenerator(); + }; +} + +function makeAgentConfig() { + return { + workingDirectory: '/tmp/test', + mcpServers: {}, + model: 'test-model', + allowedTools: [], + sdkEnv: {}, + }; +} + +function makeOptions(overrides: Partial = {}): InstallerOptions { + return { + debug: false, + forceInstall: false, + installDir: '/tmp/test', + local: true, + ci: false, + skipAuth: true, + ...overrides, + }; +} + +describe('runAgent retry loop', () => { + let emitter: InstallerEventEmitter; + let emittedEvents: Array<{ event: string; payload: unknown }>; + + beforeEach(() => { + mockQuery.mockReset(); + emitter = new InstallerEventEmitter(); + emittedEvents = []; + + // Capture all events + const originalEmit = emitter.emit.bind(emitter); + emitter.emit = ((event: string, payload: unknown) => { + emittedEvents.push({ event, payload }); + return originalEmit(event, payload); + }) as typeof emitter.emit; + }); + + it('returns retryCount=0 when no retryConfig provided', async () => { + mockQuery.mockImplementation( + createMockSDKResponse([{ text: 'Done!' }]), + ); + + const result = await runAgent( + makeAgentConfig(), + 'Test prompt', + makeOptions(), + undefined, + emitter, + ); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + }); + + it('returns retryCount=0 when validation passes first try', async () => { + mockQuery.mockImplementation( + createMockSDKResponse([{ text: 'Done!' 
}]), + ); + + const validateAndFormat = vi.fn().mockResolvedValue(null); // passes + + const result = await runAgent( + makeAgentConfig(), + 'Test prompt', + makeOptions(), + undefined, + emitter, + { maxRetries: 2, validateAndFormat }, + ); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + expect(validateAndFormat).toHaveBeenCalledTimes(1); + + // Should emit validation:retry:start and validation:retry:complete + const retryStartEvents = emittedEvents.filter((e) => e.event === 'validation:retry:start'); + const retryCompleteEvents = emittedEvents.filter((e) => e.event === 'validation:retry:complete'); + expect(retryStartEvents).toHaveLength(1); + expect(retryCompleteEvents).toHaveLength(1); + expect(retryCompleteEvents[0].payload).toEqual({ attempt: 1, passed: true }); + + // Should NOT emit agent:retry (no retry happened) + const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry'); + expect(retryEvents).toHaveLength(0); + }); + + it('retries once when validation fails then passes', async () => { + // Two turns: initial + one retry + mockQuery.mockImplementation( + createMockSDKResponse([ + { text: 'Initial attempt' }, + { text: 'Fixed it!' 
}, + ]), + ); + + const validateAndFormat = vi.fn() + .mockResolvedValueOnce('Type error in src/foo.ts') // fail first + .mockResolvedValueOnce(null); // pass second + + const result = await runAgent( + makeAgentConfig(), + 'Test prompt', + makeOptions(), + undefined, + emitter, + { maxRetries: 2, validateAndFormat }, + ); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(1); + expect(validateAndFormat).toHaveBeenCalledTimes(2); + + // Should emit agent:retry once + const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry'); + expect(retryEvents).toHaveLength(1); + expect(retryEvents[0].payload).toEqual({ attempt: 1, maxRetries: 2 }); + }); + + it('caps at maxRetries when validation always fails', async () => { + // Three turns: initial + 2 retries + mockQuery.mockImplementation( + createMockSDKResponse([ + { text: 'Attempt 1' }, + { text: 'Attempt 2' }, + { text: 'Attempt 3' }, + ]), + ); + + const validateAndFormat = vi.fn().mockResolvedValue('Still broken'); + + const result = await runAgent( + makeAgentConfig(), + 'Test prompt', + makeOptions(), + undefined, + emitter, + { maxRetries: 2, validateAndFormat }, + ); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(2); + // Called 2 times: after initial + after retry 1 + // NOT called after retry 2 because the loop exits + expect(validateAndFormat).toHaveBeenCalledTimes(2); + + const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry'); + expect(retryEvents).toHaveLength(2); + }); + + it('preserves existing behavior with maxRetries=0', async () => { + mockQuery.mockImplementation( + createMockSDKResponse([{ text: 'Done!' 
}]), + ); + + const validateAndFormat = vi.fn().mockResolvedValue('Error'); + + const result = await runAgent( + makeAgentConfig(), + 'Test prompt', + makeOptions(), + undefined, + emitter, + { maxRetries: 0, validateAndFormat }, + ); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + // validateAndFormat should never be called with maxRetries=0 + expect(validateAndFormat).not.toHaveBeenCalled(); + }); + + it('treats validateAndFormat errors as passed', async () => { + mockQuery.mockImplementation( + createMockSDKResponse([{ text: 'Done!' }]), + ); + + const validateAndFormat = vi.fn().mockRejectedValue(new Error('Validation crashed')); + + const result = await runAgent( + makeAgentConfig(), + 'Test prompt', + makeOptions(), + undefined, + emitter, + { maxRetries: 2, validateAndFormat }, + ); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + // Should have been called once, threw, treated as passed + expect(validateAndFormat).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 9022b3a..856a29a 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -72,6 +72,13 @@ export type AgentConfig = { workOSApiHost: string; }; +export interface RetryConfig { + /** Max correction attempts after initial run. Default: 2 */ + maxRetries: number; + /** Run between agent turns. Return null if passed, or error prompt if failed. 
*/ + validateAndFormat: (workingDirectory: string) => Promise<string | null>; +} + /** * Internal configuration object returned by initializeAgent */ @@ -489,7 +496,8 @@ export async function runAgent( errorMessage?: string; }, emitter?: InstallerEventEmitter, -): Promise<{ error?: AgentErrorType; errorMessage?: string }> { + retryConfig?: RetryConfig, +): Promise<{ error?: AgentErrorType; errorMessage?: string; retryCount?: number }> { const { spinnerMessage = 'Setting up WorkOS AuthKit...', successMessage = 'WorkOS AuthKit integration complete', @@ -509,24 +517,73 @@ export async function runAgent( const collectedText: string[] = []; try { - // Workaround for SDK bug: stdin closes before canUseTool responses can be sent. - // The fix is to use an async generator for the prompt that stays open until - // the result is received, keeping the stdin stream alive for permission responses. - // See: https://github.com/anthropics/claude-code/issues/4775 - // See: https://github.com/anthropics/claude-agent-sdk-typescript/issues/41 - let signalDone: () => void; - const resultReceived = new Promise((resolve) => { - signalDone = resolve; - }); + // Retry loop coordination + let retryCount = 0; + const maxRetries = retryConfig?.maxRetries ?? 0; + + // Turn completion signals — the response loop resolves currentTurnDone + // when a 'result' message arrives. The generator awaits it between turns. 
+ let resolveCurrentTurn!: () => void; + let currentTurnDone!: Promise; + + function resetTurnSignal() { + currentTurnDone = new Promise((resolve) => { + resolveCurrentTurn = resolve; + }); + } + resetTurnSignal(); const createPromptStream = async function* () { + // Initial prompt yield { type: 'user', session_id: '', message: { role: 'user', content: prompt }, parent_tool_use_id: null, }; - await resultReceived; + + // Retry loop — yield follow-up correction prompts on validation failure + if (retryConfig && maxRetries > 0) { + while (retryCount < maxRetries) { + // Wait for agent to finish current turn + await currentTurnDone; + + // Run validation between turns + emitter?.emit('validation:retry:start', { attempt: retryCount + 1 }); + + let validationPrompt: string | null; + try { + validationPrompt = await retryConfig.validateAndFormat(agentConfig.workingDirectory); + } catch (err) { + // Don't block on validation bugs — treat as passed + logError('validateAndFormat threw:', err); + validationPrompt = null; + } + + emitter?.emit('validation:retry:complete', { + attempt: retryCount + 1, + passed: validationPrompt === null, + }); + + if (validationPrompt === null) break; // Validation passed + + retryCount++; + emitter?.emit('agent:retry', { attempt: retryCount, maxRetries }); + + resetTurnSignal(); + + // Feed errors back to agent in same conversation + yield { + type: 'user', + session_id: '', + message: { role: 'user', content: validationPrompt }, + parent_tool_use_id: null, + }; + } + } + + // Keep generator alive until the final result is received + await currentTurnDone; }; // Load plugin with bundled skills @@ -570,9 +627,9 @@ export async function runAgent( if (messageError) { sdkError = messageError; } - // Signal completion when result received + // Signal turn completion when result received — this resumes the generator if (message.type === 'result') { - signalDone!(); + resolveCurrentTurn(); } } @@ -597,15 +654,18 @@ export async function runAgent( 
return { error: AgentErrorType.RESOURCE_MISSING, errorMessage: 'Could not access setup resource' }; } - logInfo(`Agent run completed in ${Math.round(durationMs / 1000)}s`); + logInfo(`Agent run completed in ${Math.round(durationMs / 1000)}s (${retryCount} retries)`); analytics.capture(INSTALLER_INTERACTION_EVENT_NAME, { action: 'agent integration completed', duration_ms: durationMs, duration_seconds: Math.round(durationMs / 1000), + retry_count: retryCount, + max_retries: maxRetries, + passed_after_retry: retryCount > 0, }); // Don't emit agent:success here - let the state machine handle lifecycle events - return {}; + return { retryCount }; } catch (error) { // Don't emit events here - just log and re-throw for state machine to handle logError('Agent run failed:', error); diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index 79a453e..b2eef7d 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -9,7 +9,7 @@ import { } from '../utils/clack-utils.js'; import { analytics } from '../utils/analytics.js'; import { INSTALLER_INTERACTION_EVENT_NAME } from './constants.js'; -import { initializeAgent, runAgent } from './agent-interface.js'; +import { initializeAgent, runAgent, type RetryConfig } from './agent-interface.js'; import { uploadEnvironmentVariablesStep } from '../steps/index.js'; import { autoConfigureWorkOSEnvironment } from './workos-management.js'; import { detectPort, getCallbackPath } from './port-detection.js'; @@ -113,7 +113,20 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal options, ); - // Run agent - errors will throw naturally with skill-based approach + // Build validation callback for retry loop — uses quick checks from Phase 1 + const validateAndFormat = async (workingDirectory: string): Promise<string | null> => { + const quickResult = await runQuickChecks(workingDirectory); + return quickResult.passed ? 
null : quickResult.agentRetryPrompt; + }; + + // Build retry config + const retryConfig: RetryConfig | undefined = + options.noValidate ? undefined : { + maxRetries: options.maxRetries ?? 2, + validateAndFormat, + }; + + // Run agent with retry support — agent gets correction prompts on validation failure const agentResult = await runAgent( agent, integrationPrompt, @@ -124,6 +137,7 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal errorMessage: 'Integration failed', }, options.emitter, + retryConfig, ); // If agent returned an error, throw so state machine can handle it @@ -133,20 +147,19 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal throw new Error(`Agent SDK error: ${message}`); } - // Run post-installation validation - if (!options.noValidate) { - // Quick checks: fast typecheck + build before full validation - options.emitter?.emit('validation:quick:start', {}); - - const quickCheckResult = await runQuickChecks(options.installDir); - - options.emitter?.emit('validation:quick:complete', { - passed: quickCheckResult.passed, - results: quickCheckResult.results, - durationMs: quickCheckResult.totalDurationMs, + // Track retry metrics + if (agentResult.retryCount !== undefined && agentResult.retryCount > 0) { + analytics.capture(INSTALLER_INTERACTION_EVENT_NAME, { + action: 'agent retry summary', + retry_count: agentResult.retryCount, + max_retries: options.maxRetries ?? 
2, + passed_after_retry: true, }); + } - // Full validation — skip build since quick checks already ran it + // Run full validation after agent (with retries) completes + // Quick checks already ran inside the retry loop — skip build + if (!options.noValidate) { options.emitter?.emit('validation:start', { framework: config.metadata.integration }); const validationResult = await validateInstallation(config.metadata.integration, options.installDir, { diff --git a/src/lib/events.ts b/src/lib/events.ts index 91458cf..e0a2279 100644 --- a/src/lib/events.ts +++ b/src/lib/events.ts @@ -52,6 +52,10 @@ export interface InstallerEvents { 'agent:progress': { step: string; detail?: string }; 'agent:success': { summary?: string }; 'agent:failure': { message: string; stack?: string }; + 'agent:retry': { attempt: number; maxRetries: number }; + + 'validation:retry:start': { attempt: number }; + 'validation:retry:complete': { attempt: number; passed: boolean }; 'validation:quick:start': Record; 'validation:quick:complete': { diff --git a/src/utils/types.ts b/src/utils/types.ts index cb54d76..901a05c 100644 --- a/src/utils/types.ts +++ b/src/utils/types.ts @@ -91,6 +91,13 @@ export type InstallerOptions = { * Requires ANTHROPIC_API_KEY environment variable. */ direct?: boolean; + + /** + * Max correction attempts after initial agent run. + * The agent gets this many chances to fix validation failures (typecheck/build). + * Default: 2. Set to 0 to disable retries entirely. + */ + maxRetries?: number; }; export interface Feature { From c0ad5ae3a7a5de43d33a7c38fd56495725523599 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 07:56:24 -0600 Subject: [PATCH 03/14] feat: add within-session correction metrics to evals framework Add retry-aware execution to AgentExecutor using the same async generator + quick-checks pattern from production. Evals now track three tiers: first-attempt, with-correction, and with-retry pass rates. 
Adds --no-correction flag to disable for baseline comparison. --- tests/evals/agent-executor.ts | 85 +++++++++++++++++++++++++++++++-- tests/evals/cli.ts | 6 +++ tests/evals/index.ts | 1 + tests/evals/parallel-runner.ts | 7 ++- tests/evals/reporter.ts | 3 +- tests/evals/runner.ts | 9 +++- tests/evals/success-criteria.ts | 20 ++++++-- tests/evals/types.ts | 4 ++ 8 files changed, 125 insertions(+), 10 deletions(-) diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts index 3c4b0cd..5a9b135 100644 --- a/tests/evals/agent-executor.ts +++ b/tests/evals/agent-executor.ts @@ -7,6 +7,7 @@ import { writeEnvLocal } from '../../src/lib/env-writer.js'; import { parseEnvFile } from '../../src/utils/env-parser.js'; import { getConfig } from '../../src/lib/settings.js'; import { LatencyTracker } from './latency-tracker.js'; +import { runQuickChecks } from '../../src/lib/validation/quick-checks.js'; import type { ToolCall, LatencyMetrics } from './types.js'; export interface AgentResult { @@ -15,6 +16,17 @@ export interface AgentResult { toolCalls: ToolCall[]; error?: string; latencyMetrics?: LatencyMetrics; + /** Number of within-session correction attempts */ + correctionAttempts: number; + /** Whether the agent self-corrected after an initial failure */ + selfCorrected: boolean; +} + +export interface AgentRetryConfig { + /** Enable within-session correction. Default: true */ + enabled: boolean; + /** Max correction attempts. Default: 2 */ + maxRetries: number; } export interface AgentExecutorOptions { @@ -77,7 +89,8 @@ export class AgentExecutor { this.latencyTracker = new LatencyTracker(); } - async run(): Promise { + async run(retryConfig?: AgentRetryConfig): Promise { + const config = retryConfig ?? 
{ enabled: true, maxRetries: 2 }; const integration = this.getIntegration(); const toolCalls: ToolCall[] = []; const collectedOutput: string[] = []; @@ -106,6 +119,22 @@ export class AgentExecutor { const skillName = SKILL_NAMES[integration]; const prompt = this.buildPrompt(skillName); + // Retry loop coordination + let correctionAttempts = 0; + const maxRetries = config.enabled ? config.maxRetries : 0; + const workDir = this.workDir; + + // Turn completion signals + let resolveCurrentTurn!: () => void; + let currentTurnDone!: Promise; + + function resetTurnSignal() { + currentTurnDone = new Promise((resolve) => { + resolveCurrentTurn = resolve; + }); + } + resetTurnSignal(); + // Initialize and run agent try { const { query } = await import('@anthropic-ai/claude-agent-sdk'); @@ -126,8 +155,51 @@ export class AgentExecutor { const __dirname = path.dirname(__filename); const pluginPath = path.join(__dirname, '../..'); + // Retry-aware prompt stream (same pattern as production agent-interface.ts) + const createPromptStream = async function* () { + yield { + type: 'user', + session_id: '', + message: { role: 'user', content: prompt }, + parent_tool_use_id: null, + }; + + if (maxRetries > 0) { + while (correctionAttempts < maxRetries) { + await currentTurnDone; + + let validationPrompt: string | null; + try { + const quickResult = await runQuickChecks(workDir); + validationPrompt = quickResult.passed ? 
null : quickResult.agentRetryPrompt; + } catch { + validationPrompt = null; // treat validation errors as passed + } + + if (validationPrompt === null) break; + + correctionAttempts++; + if (label && process.env.EVAL_VERBOSE) { + console.log(`${label} Correction attempt ${correctionAttempts}/${maxRetries}`); + } + + resetTurnSignal(); + + yield { + type: 'user', + session_id: '', + message: { role: 'user', content: validationPrompt }, + parent_tool_use_id: null, + }; + } + } + + // Keep generator alive until final result + await currentTurnDone; + }; + const response = query({ - prompt: prompt, + prompt: createPromptStream(), options: { model: getConfig().model, cwd: this.workDir, @@ -145,9 +217,12 @@ export class AgentExecutor { }, }); - // Process message stream + // Process message stream — signal turn completion on result for await (const message of response) { this.handleMessage(message, toolCalls, collectedOutput, label); + if (message.type === 'result') { + resolveCurrentTurn(); + } } const latencyMetrics = this.latencyTracker.finish(); @@ -156,6 +231,8 @@ export class AgentExecutor { output: collectedOutput.join('\n'), toolCalls, latencyMetrics, + correctionAttempts, + selfCorrected: correctionAttempts > 0, }; } catch (error) { const latencyMetrics = this.latencyTracker.finish(); @@ -165,6 +242,8 @@ export class AgentExecutor { toolCalls, latencyMetrics, error: error instanceof Error ? 
error.message : String(error), + correctionAttempts, + selfCorrected: false, }; } } diff --git a/tests/evals/cli.ts b/tests/evals/cli.ts index 757a959..12a4cca 100644 --- a/tests/evals/cli.ts +++ b/tests/evals/cli.ts @@ -12,6 +12,7 @@ export interface CliOptions { sequential: boolean; noDashboard: boolean; noFail: boolean; + noCorrection: boolean; quality: boolean; command?: 'run' | 'history' | 'compare' | 'diff' | 'prune' | 'logs' | 'show'; compareIds?: [string, string]; @@ -61,6 +62,7 @@ export function parseArgs(args: string[]): CliOptions { sequential: false, noDashboard: false, noFail: false, + noCorrection: false, quality: false, }; @@ -144,6 +146,8 @@ export function parseArgs(args: string[]): CliOptions { options.noDashboard = true; } else if (arg === '--no-fail') { options.noFail = true; + } else if (arg === '--no-correction') { + options.noCorrection = true; } else if (arg === '--quality' || arg === '-q') { options.quality = true; } @@ -193,6 +197,8 @@ Options: --no-fail Exit 0 even if success criteria thresholds not met + --no-correction Disable within-session agent self-correction retries + --quality, -q Enable LLM-based quality grading (adds cost/time) --json Output results as JSON (for scripting) diff --git a/tests/evals/index.ts b/tests/evals/index.ts index 7e92274..118f3a2 100644 --- a/tests/evals/index.ts +++ b/tests/evals/index.ts @@ -60,6 +60,7 @@ async function main() { noDashboard: options.noDashboard, debug: options.debug, noFail: options.noFail, + noCorrection: options.noCorrection, quality: options.quality, }); diff --git a/tests/evals/parallel-runner.ts b/tests/evals/parallel-runner.ts index 4bf2f35..2383db7 100644 --- a/tests/evals/parallel-runner.ts +++ b/tests/evals/parallel-runner.ts @@ -18,6 +18,7 @@ interface ParallelRunnerOptions { keep?: boolean; keepOnFail?: boolean; concurrency?: number; // Override auto-detection + noCorrection?: boolean; } export class ParallelRunner { @@ -125,7 +126,9 @@ export class ParallelRunner { verbose: 
this.options.verbose, scenarioName, }); - const agentResult = await executor.run(); + const agentResult = await executor.run( + this.options.noCorrection ? { enabled: false, maxRetries: 0 } : undefined, + ); lastToolCalls = agentResult.toolCalls; const grader = new scenario.grader(workDir); @@ -143,6 +146,8 @@ export class ParallelRunner { attempts: attempt, latencyMetrics: agentResult.latencyMetrics, keyFiles, + correctionAttempts: agentResult.correctionAttempts, + selfCorrected: agentResult.selfCorrected, }; if (gradeResult.passed) { diff --git a/tests/evals/reporter.ts b/tests/evals/reporter.ts index 4cd1b4f..297641e 100644 --- a/tests/evals/reporter.ts +++ b/tests/evals/reporter.ts @@ -59,7 +59,8 @@ export function printMatrix(results: EvalResult[]): void { const passed = results.filter((r) => r.passed).length; const total = results.length; const rate = ((passed / total) * 100).toFixed(1); - console.log(`\nResults: ${passed}/${total} passed (${rate}%)`); + const selfCorrected = results.filter((r) => r.selfCorrected).length; + console.log(`\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? `, ${selfCorrected} self-corrected` : ''}`); if (passed < total) { console.log('\nFailed scenarios:'); diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts index c48db6d..2cadba1 100644 --- a/tests/evals/runner.ts +++ b/tests/evals/runner.ts @@ -98,6 +98,7 @@ export interface ExtendedEvalOptions extends EvalOptions { noDashboard?: boolean; debug?: boolean; noFail?: boolean; + noCorrection?: boolean; quality?: boolean; } @@ -122,6 +123,7 @@ export async function runEvals(options: ExtendedEvalOptions): Promise r.attempts === 1 && r.passed).length; + // First attempt: passed on first scenario attempt with no corrections + const firstAttemptPassed = results.filter( + (r) => r.attempts === 1 && r.passed && (r.correctionAttempts ?? 
0) === 0, + ).length; + // With correction: passed on first scenario attempt (may have used within-session correction) + const withCorrectionPassed = results.filter((r) => r.attempts === 1 && r.passed).length; const totalPassed = results.filter((r) => r.passed).length; const firstAttemptRate = results.length > 0 ? firstAttemptPassed / results.length : 0; + const withCorrectionRate = results.length > 0 ? withCorrectionPassed / results.length : 0; const withRetryRate = results.length > 0 ? totalPassed / results.length : 0; const failures: string[] = []; @@ -46,6 +55,11 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria `First-attempt pass rate ${(firstAttemptRate * 100).toFixed(1)}% < ${criteria.firstAttemptPassRate * 100}% required`, ); } + if (criteria.withCorrectionPassRate !== undefined && withCorrectionRate < criteria.withCorrectionPassRate) { + failures.push( + `With-correction pass rate ${(withCorrectionRate * 100).toFixed(1)}% < ${criteria.withCorrectionPassRate * 100}% required`, + ); + } if (withRetryRate < criteria.withRetryPassRate) { failures.push( `With-retry pass rate ${(withRetryRate * 100).toFixed(1)}% < ${criteria.withRetryPassRate * 100}% required`, @@ -55,7 +69,7 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria return { passed: failures.length === 0, criteria, - actual: { firstAttemptPassRate: firstAttemptRate, withRetryPassRate: withRetryRate }, + actual: { firstAttemptPassRate: firstAttemptRate, withCorrectionPassRate: withCorrectionRate, withRetryPassRate: withRetryRate }, failures, }; } diff --git a/tests/evals/types.ts b/tests/evals/types.ts index 3f626d2..9891597 100644 --- a/tests/evals/types.ts +++ b/tests/evals/types.ts @@ -27,6 +27,10 @@ export interface EvalResult { qualityGrade?: QualityGrade; /** Key integration files for quality grading (replaces raw diff) */ keyFiles?: Map; + /** Within-session correction attempts (0 = passed first try) */ + correctionAttempts?: 
number; + /** Agent self-corrected after initial failure */ + selfCorrected?: boolean; } /** Input for quality grading - structured data instead of raw diff */ From 03984c060b11a7281ab2c4691f17cd41f416e95e Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 08:34:56 -0600 Subject: [PATCH 04/14] refactor: unify eval executor with production runAgent path AgentExecutor now delegates to the production runAgent instead of reimplementing the retry-aware async generator. Exports AgentRunConfig so evals can construct it directly, adds onMessage hook for latency tracking. Includes 13 tests verifying the wiring. --- src/lib/agent-interface.ts | 8 +- tests/evals/__tests__/agent-executor.spec.ts | 256 +++++++++++++++++++ tests/evals/agent-executor.ts | 185 ++++++-------- 3 files changed, 336 insertions(+), 113 deletions(-) create mode 100644 tests/evals/__tests__/agent-executor.spec.ts diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 856a29a..5b0018c 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -80,9 +80,10 @@ export interface RetryConfig { } /** - * Internal configuration object returned by initializeAgent + * Configuration object for running the agent. + * Built by initializeAgent (production) or constructed directly (evals). 
*/ -type AgentRunConfig = { +export type AgentRunConfig = { workingDirectory: string; mcpServers: McpServersConfig; model: string; @@ -497,6 +498,7 @@ export async function runAgent( }, emitter?: InstallerEventEmitter, retryConfig?: RetryConfig, + onMessage?: (message: SDKMessage) => void, ): Promise<{ error?: AgentErrorType; errorMessage?: string; retryCount?: number }> { const { spinnerMessage = 'Setting up WorkOS AuthKit...', @@ -631,6 +633,8 @@ export async function runAgent( if (message.type === 'result') { resolveCurrentTurn(); } + // Let callers observe messages (e.g., for latency tracking in evals) + try { onMessage?.(message); } catch { /* observer errors are non-critical */ } } const durationMs = Date.now() - startTime; diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts new file mode 100644 index 0000000..2ca23bc --- /dev/null +++ b/tests/evals/__tests__/agent-executor.spec.ts @@ -0,0 +1,256 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + +// Hoist mocks so they're available in vi.mock factories +const { mockRunAgent, mockConfig, mockCredentials } = vi.hoisted(() => ({ + mockRunAgent: vi.fn(), + mockConfig: { + model: 'test-model', + workos: { clientId: 'client_test', authkitDomain: 'test.workos.com', llmGatewayUrl: 'http://localhost:8000' }, + telemetry: { enabled: false, eventName: 'test_event' }, + proxy: { refreshThresholdMs: 300000 }, + nodeVersion: '20', + logging: { debugMode: false }, + documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' }, + frameworks: {}, + legacy: { oauthPort: 3000 }, + branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false }, + }, + mockCredentials: { + workosApiKey: 'sk_test_key', + workosClientId: 
'client_test_id', + anthropicApiKey: 'sk-ant-test', + }, +})); + +// Mock the production runAgent — this is what we're testing the wiring to +vi.mock('../../../src/lib/agent-interface.js', () => ({ + runAgent: mockRunAgent, +})); + +// Mock dependencies +vi.mock('../env-loader.js', () => ({ + loadCredentials: vi.fn(() => mockCredentials), +})); + +vi.mock('../../../src/lib/env-writer.js', () => ({ + writeEnvLocal: vi.fn(), +})); + +vi.mock('../../../src/utils/env-parser.js', () => ({ + parseEnvFile: vi.fn(() => ({})), +})); + +vi.mock('../../../src/lib/settings.js', () => ({ + getConfig: vi.fn(() => mockConfig), +})); + +vi.mock('../../../src/lib/validation/quick-checks.js', () => ({ + runQuickChecks: vi.fn(), +})); + +// Mock debug/analytics that agent-interface transitively imports +vi.mock('../../../src/utils/debug.js', () => ({ + debug: vi.fn(), + logInfo: vi.fn(), + logWarn: vi.fn(), + logError: vi.fn(), + initLogFile: vi.fn(), + getLogFilePath: vi.fn(() => null), +})); + +vi.mock('../../../src/utils/analytics.js', () => ({ + analytics: { + capture: vi.fn(), + setTag: vi.fn(), + shutdown: vi.fn(), + llmRequest: vi.fn(), + incrementAgentIterations: vi.fn(), + toolCalled: vi.fn(), + }, +})); + +import { AgentExecutor } from '../agent-executor.js'; +import { writeEnvLocal } from '../../../src/lib/env-writer.js'; + +describe('AgentExecutor', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'agent-executor-test-')); + // Create package.json so env writing works + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ name: 'test' })); + mockRunAgent.mockReset(); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('calls production runAgent with correct AgentRunConfig', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + expect(mockRunAgent).toHaveBeenCalledTimes(1); + + 
const [agentRunConfig] = mockRunAgent.mock.calls[0]; + expect(agentRunConfig.workingDirectory).toBe(testDir); + expect(agentRunConfig.model).toBe('test-model'); + expect(agentRunConfig.allowedTools).toContain('Skill'); + expect(agentRunConfig.allowedTools).toContain('Write'); + expect(agentRunConfig.mcpServers).toHaveProperty('workos'); + // Direct mode — no gateway URL + expect(agentRunConfig.sdkEnv.ANTHROPIC_API_KEY).toBe('sk-ant-test'); + expect(agentRunConfig.sdkEnv.ANTHROPIC_BASE_URL).toBeUndefined(); + }); + + it('passes RetryConfig when correction is enabled', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run({ enabled: true, maxRetries: 3 }); + + const retryConfig = mockRunAgent.mock.calls[0][5]; // 6th arg + expect(retryConfig).toBeDefined(); + expect(retryConfig.maxRetries).toBe(3); + expect(typeof retryConfig.validateAndFormat).toBe('function'); + }); + + it('passes no RetryConfig when correction is disabled', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run({ enabled: false, maxRetries: 0 }); + + const retryConfig = mockRunAgent.mock.calls[0][5]; + expect(retryConfig).toBeUndefined(); + }); + + it('passes InstallerOptions with skipAuth=true', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + const installerOptions = mockRunAgent.mock.calls[0][2]; // 3rd arg + expect(installerOptions.skipAuth).toBe(true); + expect(installerOptions.installDir).toBe(testDir); + }); + + it('passes onMessage callback as 7th argument', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + const onMessage = mockRunAgent.mock.calls[0][6]; // 7th arg + expect(typeof 
onMessage).toBe('function'); + }); + + it('maps retryCount=0 to correctionAttempts=0, selfCorrected=false', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(true); + expect(result.correctionAttempts).toBe(0); + expect(result.selfCorrected).toBe(false); + }); + + it('maps retryCount>0 to selfCorrected=true on success', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 2 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(true); + expect(result.correctionAttempts).toBe(2); + expect(result.selfCorrected).toBe(true); + }); + + it('maps runAgent error result to failed AgentResult', async () => { + mockRunAgent.mockResolvedValue({ + error: 'EXECUTION_ERROR', + errorMessage: 'SDK crashed', + retryCount: 1, + }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(false); + expect(result.error).toBe('SDK crashed'); + expect(result.correctionAttempts).toBe(1); + expect(result.selfCorrected).toBe(false); + }); + + it('handles runAgent throwing an exception', async () => { + mockRunAgent.mockRejectedValue(new Error('Connection refused')); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(false); + expect(result.error).toBe('Connection refused'); + expect(result.correctionAttempts).toBe(0); + }); + + it('writes env vars before calling runAgent', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + expect(writeEnvLocal).toHaveBeenCalledWith(testDir, { + WORKOS_API_KEY: 'sk_test_key', + WORKOS_CLIENT_ID: 'client_test_id', + }); + }); + + it('onMessage callback collects text output from 
assistant messages', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + // Get the onMessage callback and simulate a message + const onMessage = mockRunAgent.mock.calls[0][6]; + onMessage({ + type: 'assistant', + message: { + content: [{ type: 'text', text: 'Installing AuthKit...' }], + }, + }); + + // Run again to verify output is collected (can't check internal state, + // but we can verify it doesn't throw) + expect(onMessage).toBeDefined(); + }); + + it('builds prompt with correct skill name for framework', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'react-router'); + await executor.run(); + + const prompt = mockRunAgent.mock.calls[0][1]; // 2nd arg + expect(prompt).toContain('workos-authkit-react-router'); + expect(prompt).toContain('react-router'); + }); + + it('defaults to correction enabled with maxRetries=2', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); // no retryConfig arg — uses default + + const retryConfig = mockRunAgent.mock.calls[0][5]; + expect(retryConfig).toBeDefined(); + expect(retryConfig.maxRetries).toBe(2); + }); +}); diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts index 5a9b135..73c7e98 100644 --- a/tests/evals/agent-executor.ts +++ b/tests/evals/agent-executor.ts @@ -1,13 +1,13 @@ -import path from 'node:path'; import { writeFileSync, existsSync, readFileSync } from 'node:fs'; import { join } from 'node:path'; -import { fileURLToPath } from 'node:url'; import { loadCredentials } from './env-loader.js'; import { writeEnvLocal } from '../../src/lib/env-writer.js'; import { parseEnvFile } from '../../src/utils/env-parser.js'; import { getConfig } from '../../src/lib/settings.js'; import { LatencyTracker } from './latency-tracker.js'; import { 
runQuickChecks } from '../../src/lib/validation/quick-checks.js'; +import { runAgent, type AgentRunConfig, type RetryConfig } from '../../src/lib/agent-interface.js'; +import type { InstallerOptions } from '../../src/utils/types.js'; import type { ToolCall, LatencyMetrics } from './types.js'; export interface AgentResult { @@ -119,113 +119,78 @@ export class AgentExecutor { const skillName = SKILL_NAMES[integration]; const prompt = this.buildPrompt(skillName); - // Retry loop coordination - let correctionAttempts = 0; - const maxRetries = config.enabled ? config.maxRetries : 0; - const workDir = this.workDir; + // Build SDK environment for direct mode + const sdkEnv: Record = { + ...process.env, + ANTHROPIC_API_KEY: this.credentials.anthropicApiKey, + CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true', + CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true', + }; + delete sdkEnv.ANTHROPIC_BASE_URL; + delete sdkEnv.ANTHROPIC_AUTH_TOKEN; + + // Construct AgentRunConfig directly (bypasses initializeAgent/gateway auth) + const agentRunConfig: AgentRunConfig = { + workingDirectory: this.workDir, + mcpServers: { + workos: { + command: 'npx', + args: ['-y', '@workos/mcp-docs-server'], + }, + }, + model: getConfig().model, + allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'], + sdkEnv, + }; - // Turn completion signals - let resolveCurrentTurn!: () => void; - let currentTurnDone!: Promise; + // Thin InstallerOptions — only what runAgent needs + const installerOptions: InstallerOptions = { + debug: this.options.verbose ?? false, + forceInstall: false, + installDir: this.workDir, + local: false, + ci: true, + skipAuth: true, + }; - function resetTurnSignal() { - currentTurnDone = new Promise((resolve) => { - resolveCurrentTurn = resolve; - }); - } - resetTurnSignal(); + // Build production RetryConfig with validateAndFormat callback + const prodRetryConfig: RetryConfig | undefined = config.enabled + ? 
{ + maxRetries: config.maxRetries, + validateAndFormat: async (workingDirectory: string): Promise => { + const quickResult = await runQuickChecks(workingDirectory); + return quickResult.passed ? null : quickResult.agentRetryPrompt; + }, + } + : undefined; - // Initialize and run agent try { - const { query } = await import('@anthropic-ai/claude-agent-sdk'); + // Delegate to production runAgent — same retry loop, same generator coordination + const result = await runAgent( + agentRunConfig, + prompt, + installerOptions, + undefined, // no spinner config + undefined, // no emitter + prodRetryConfig, + (message) => this.trackMessage(message, toolCalls, collectedOutput, label), + ); - // Build SDK environment for direct mode - const sdkEnv: Record = { - ...process.env, - ANTHROPIC_API_KEY: this.credentials.anthropicApiKey, - CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true', - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true', - }; - // Remove gateway config to use direct API - delete sdkEnv.ANTHROPIC_BASE_URL; - delete sdkEnv.ANTHROPIC_AUTH_TOKEN; - - // Get plugin path for skills - const __filename = fileURLToPath(import.meta.url); - const __dirname = path.dirname(__filename); - const pluginPath = path.join(__dirname, '../..'); - - // Retry-aware prompt stream (same pattern as production agent-interface.ts) - const createPromptStream = async function* () { - yield { - type: 'user', - session_id: '', - message: { role: 'user', content: prompt }, - parent_tool_use_id: null, + const latencyMetrics = this.latencyTracker.finish(); + const correctionAttempts = result.retryCount ?? 0; + + if (result.error) { + return { + success: false, + output: collectedOutput.join('\n'), + toolCalls, + latencyMetrics, + error: result.errorMessage ?? 
String(result.error), + correctionAttempts, + selfCorrected: false, }; - - if (maxRetries > 0) { - while (correctionAttempts < maxRetries) { - await currentTurnDone; - - let validationPrompt: string | null; - try { - const quickResult = await runQuickChecks(workDir); - validationPrompt = quickResult.passed ? null : quickResult.agentRetryPrompt; - } catch { - validationPrompt = null; // treat validation errors as passed - } - - if (validationPrompt === null) break; - - correctionAttempts++; - if (label && process.env.EVAL_VERBOSE) { - console.log(`${label} Correction attempt ${correctionAttempts}/${maxRetries}`); - } - - resetTurnSignal(); - - yield { - type: 'user', - session_id: '', - message: { role: 'user', content: validationPrompt }, - parent_tool_use_id: null, - }; - } - } - - // Keep generator alive until final result - await currentTurnDone; - }; - - const response = query({ - prompt: createPromptStream(), - options: { - model: getConfig().model, - cwd: this.workDir, - permissionMode: 'acceptEdits', - mcpServers: { - workos: { - command: 'npx', - args: ['-y', '@workos/mcp-docs-server'], - }, - }, - env: sdkEnv, - tools: { type: 'preset', preset: 'claude_code' }, - allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'], - plugins: [{ type: 'local', path: pluginPath }], - }, - }); - - // Process message stream — signal turn completion on result - for await (const message of response) { - this.handleMessage(message, toolCalls, collectedOutput, label); - if (message.type === 'result') { - resolveCurrentTurn(); - } } - const latencyMetrics = this.latencyTracker.finish(); return { success: true, output: collectedOutput.join('\n'), @@ -242,7 +207,7 @@ export class AgentExecutor { toolCalls, latencyMetrics, error: error instanceof Error ? 
error.message : String(error), - correctionAttempts, + correctionAttempts: 0, selfCorrected: false, }; } @@ -266,15 +231,17 @@ Use the \`${skillName}\` skill to integrate WorkOS AuthKit into this application Begin by invoking the ${skillName} skill.`; } - private handleMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void { + /** + * Observe SDK messages for latency tracking and output collection. + * This is called via the onMessage hook — production handleSDKMessage runs first. + */ + private trackMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void { if (message.type === 'assistant') { - // End any in-progress tool call when we get a new assistant message this.latencyTracker.endToolCall(); const content = message.message?.content; if (Array.isArray(content)) { for (const block of content) { - // Capture text output and track TTFT if (block.type === 'text' && typeof block.text === 'string') { this.latencyTracker.recordFirstContent(); collectedOutput.push(block.text); @@ -282,14 +249,12 @@ Begin by invoking the ${skillName} skill.`; console.log(`${label} Agent: ${block.text.slice(0, 100)}...`); } } - // Capture tool calls and start timing if (block.type === 'tool_use') { this.latencyTracker.startToolCall(block.name); - const call: ToolCall = { + toolCalls.push({ tool: block.name, input: block.input as Record, - }; - toolCalls.push(call); + }); if (this.options.verbose) { console.log(`${label} Tool: ${block.name}`); } @@ -299,7 +264,6 @@ Begin by invoking the ${skillName} skill.`; } if (message.type === 'result') { - // Capture token usage from result if (message.usage) { this.latencyTracker.recordTokens(message.usage.input_tokens ?? 0, message.usage.output_tokens ?? 
0); } @@ -310,7 +274,6 @@ Begin by invoking the ${skillName} skill.`; } private getIntegration(): string { - // Integration is now a string type — framework name IS the integration name return this.framework; } } From 81a374e39ec3d5d6132ee5bcf78d9a3e00df3380 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 08:41:09 -0600 Subject: [PATCH 05/14] fix: recalibrate success criteria thresholds for correction-aware metrics First-attempt now means zero corrections, which is stricter than before. Lower threshold to 30% (aspirational), add withCorrectionPassRate at 90% as the primary quality gate, keep withRetryPassRate at 95%. --- tests/evals/success-criteria.spec.ts | 69 ++++++++++++++++++---------- tests/evals/success-criteria.ts | 3 +- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts index ba62604..24fc969 100644 --- a/tests/evals/success-criteria.spec.ts +++ b/tests/evals/success-criteria.spec.ts @@ -2,30 +2,35 @@ import { describe, it, expect } from 'vitest'; import { validateResults, DEFAULT_CRITERIA, type SuccessCriteria } from './success-criteria.js'; import type { EvalResult } from './types.js'; -function makeResult(passed: boolean, attempts: number = 1): EvalResult { +function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: number = 0): EvalResult { return { scenario: `test-${Math.random().toString(36).slice(2)}`, passed, duration: 1000, attempts, + correctionAttempts, }; } describe('success-criteria', () => { describe('DEFAULT_CRITERIA', () => { it('has expected default thresholds', () => { - expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.9); + expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.3); + expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9); expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95); }); }); describe('validateResults', () => { it('returns passed=true when all criteria met', () => { - // 10 results, 9 
passed on first attempt, 1 passed on retry + // 10 results: 4 clean (40% > 30%), 5 corrected (9/10 = 90% correction), 1 retried (100% retry) const results: EvalResult[] = [ - ...Array(9) + ...Array(4) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), + ...Array(5) + .fill(null) + .map(() => makeResult(true, 1, 1)), makeResult(true, 2), ]; @@ -33,34 +38,38 @@ describe('success-criteria', () => { expect(validation.passed).toBe(true); expect(validation.failures).toHaveLength(0); - expect(validation.actual.firstAttemptPassRate).toBe(0.9); + expect(validation.actual.firstAttemptPassRate).toBe(0.4); + expect(validation.actual.withCorrectionPassRate).toBe(0.9); expect(validation.actual.withRetryPassRate).toBe(1); }); it('returns passed=false when first-attempt rate below threshold', () => { - // 10 results, only 8 passed on first attempt + // 10 results, only 2 passed on first attempt (20% < 30% threshold) const results: EvalResult[] = [ - ...Array(8) + ...Array(2) .fill(null) .map(() => makeResult(true, 1)), - makeResult(true, 2), + ...Array(7) + .fill(null) + .map(() => makeResult(true, 2)), makeResult(true, 2), ]; const validation = validateResults(results); expect(validation.passed).toBe(false); - expect(validation.failures).toHaveLength(1); - expect(validation.failures[0]).toContain('First-attempt'); - expect(validation.failures[0]).toContain('80.0%'); + expect(validation.failures.some((f) => f.includes('First-attempt'))).toBe(true); }); it('returns passed=false when with-retry rate below threshold', () => { - // 10 results, 9 passed first attempt, 1 failed entirely + // 10 results: 4 clean, 5 corrected (90% correction), 1 failed → 90% retry < 95% const results: EvalResult[] = [ - ...Array(9) + ...Array(4) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), + ...Array(5) + .fill(null) + .map(() => makeResult(true, 1, 1)), makeResult(false, 3), ]; @@ -71,21 +80,24 @@ describe('success-criteria', () => 
{ expect(validation.failures[0]).toContain('With-retry'); }); - it('returns both failures when both criteria not met', () => { - // 10 results, 7 passed first attempt, 1 failed + it('returns both failures when multiple criteria not met', () => { + // 10 results, 2 passed first attempt (20% < 30%), 4 failed entirely (60% < 95% retry) const results: EvalResult[] = [ - ...Array(7) + ...Array(2) .fill(null) .map(() => makeResult(true, 1)), - makeResult(true, 2), - makeResult(true, 2), - makeResult(false, 3), + ...Array(4) + .fill(null) + .map(() => makeResult(true, 2)), + ...Array(4) + .fill(null) + .map(() => makeResult(false, 3)), ]; const validation = validateResults(results); expect(validation.passed).toBe(false); - expect(validation.failures).toHaveLength(2); + expect(validation.failures.length).toBeGreaterThanOrEqual(2); }); it('handles empty results array', () => { @@ -120,11 +132,18 @@ describe('success-criteria', () => { }); it('passes when exactly at threshold', () => { - // Exactly 90% first-attempt, 95% with-retry + // 20 results: + // 6 clean first-attempt (attempt=1, corrections=0) → 30% first-attempt + // 12 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction + // 1 passed on scenario retry (attempt=2) → 19/20 = 95% with-retry + // 1 failed (attempt=3) const results: EvalResult[] = [ - ...Array(18) + ...Array(6) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), + ...Array(12) + .fill(null) + .map(() => makeResult(true, 1, 1)), makeResult(true, 2), makeResult(false, 3), ]; diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts index e25752b..57f1ed4 100644 --- a/tests/evals/success-criteria.ts +++ b/tests/evals/success-criteria.ts @@ -17,7 +17,8 @@ export interface SuccessCriteria { /** Default thresholds for CI enforcement */ export const DEFAULT_CRITERIA: SuccessCriteria = { - firstAttemptPassRate: 0.9, + firstAttemptPassRate: 0.3, + withCorrectionPassRate: 0.9, 
withRetryPassRate: 0.95, }; From 61ee472ebdf9343565898c6a5dbf11f449023cb3 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 08:42:12 -0600 Subject: [PATCH 06/14] chore: disable dotnet eval scenario (broken SDK, no runtime) --- tests/evals/runner.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts index 2cadba1..5375ea3 100644 --- a/tests/evals/runner.ts +++ b/tests/evals/runner.ts @@ -87,8 +87,8 @@ const SCENARIOS: Scenario[] = [ { framework: 'elixir', state: 'example', grader: ElixirGrader }, { framework: 'elixir', state: 'example-auth0', grader: ElixirGrader }, - // .NET (broken — no runtime) - { framework: 'dotnet', state: 'example', grader: DotnetGrader }, + // .NET (disabled — SDK is broken and no runtime available on most machines) + // { framework: 'dotnet', state: 'example', grader: DotnetGrader }, ]; export interface ExtendedEvalOptions extends EvalOptions { From f891dfeb5efea20c2cd6c3d6c96c5f8dd1573437 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 08:53:49 -0600 Subject: [PATCH 07/14] fix: lower first-attempt threshold to 20% to match observed baseline Two eval runs show ~21-27% first-attempt rate. The correction loop consistently brings it to 93-100%. Set threshold at 20% to catch regressions without failing on normal variance. 
--- tests/evals/success-criteria.spec.ts | 12 +++++------- tests/evals/success-criteria.ts | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts index 24fc969..7a8dae9 100644 --- a/tests/evals/success-criteria.spec.ts +++ b/tests/evals/success-criteria.spec.ts @@ -15,7 +15,7 @@ function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: n describe('success-criteria', () => { describe('DEFAULT_CRITERIA', () => { it('has expected default thresholds', () => { - expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.3); + expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.2); expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9); expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95); }); @@ -44,14 +44,12 @@ describe('success-criteria', () => { }); it('returns passed=false when first-attempt rate below threshold', () => { - // 10 results, only 2 passed on first attempt (20% < 30% threshold) + // 10 results, only 1 passed on first attempt (10% < 20% threshold) const results: EvalResult[] = [ - ...Array(2) - .fill(null) - .map(() => makeResult(true, 1)), - ...Array(7) + makeResult(true, 1, 0), + ...Array(8) .fill(null) - .map(() => makeResult(true, 2)), + .map(() => makeResult(true, 1, 1)), makeResult(true, 2), ]; diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts index 57f1ed4..110ce51 100644 --- a/tests/evals/success-criteria.ts +++ b/tests/evals/success-criteria.ts @@ -17,7 +17,7 @@ export interface SuccessCriteria { /** Default thresholds for CI enforcement */ export const DEFAULT_CRITERIA: SuccessCriteria = { - firstAttemptPassRate: 0.3, + firstAttemptPassRate: 0.2, withCorrectionPassRate: 0.9, withRetryPassRate: 0.95, }; From 46f33bfddf01c2f30e0fc05e734de115db5e08c8 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 08:56:39 -0600 Subject: [PATCH 08/14] fix: skip typecheck on non-TypeScript projects, raise 
first-attempt threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detectTypecheckCommand was falling back to npx tsc --noEmit for every project including Python, Ruby, Go, etc. Now checks for tsconfig.json before falling back — no tsconfig means skip typecheck entirely. This eliminates false correction triggers on non-JS frameworks. Raises first-attempt threshold to 50% since the false positives were the main driver of the low rate. --- src/lib/validation/quick-checks.spec.ts | 17 ++++++++++- src/lib/validation/quick-checks.ts | 10 +++++-- tests/evals/success-criteria.spec.ts | 38 +++++++++++++------------ tests/evals/success-criteria.ts | 2 +- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts index 888c2b9..4b4cf8c 100644 --- a/src/lib/validation/quick-checks.spec.ts +++ b/src/lib/validation/quick-checks.spec.ts @@ -230,11 +230,12 @@ describe('runTypecheckValidation', () => { ); }); - it('falls back to npx tsc --noEmit when no typecheck script', async () => { + it('falls back to npx tsc --noEmit when no typecheck script but tsconfig exists', async () => { writeFileSync( join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } }), ); + writeFileSync(join(testDir, 'tsconfig.json'), '{}'); mockSpawn.mockImplementationOnce(() => createMockProcess(0)); await runTypecheckValidation(testDir); @@ -246,6 +247,20 @@ describe('runTypecheckValidation', () => { ); }); + it('skips typecheck when no tsconfig.json and no typecheck script', async () => { + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ scripts: { build: 'go build' } }), + ); + // No tsconfig.json — not a TypeScript project + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(true); + expect(result.issues).toHaveLength(0); + expect(mockSpawn).not.toHaveBeenCalled(); + }); + it('detects 
type-check script (hyphenated variant)', async () => { writeFileSync( join(testDir, 'package.json'), diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts index 1a0b3a2..128eee7 100644 --- a/src/lib/validation/quick-checks.ts +++ b/src/lib/validation/quick-checks.ts @@ -155,8 +155,14 @@ async function detectTypecheckCommand(projectDir: string): Promise { describe('DEFAULT_CRITERIA', () => { it('has expected default thresholds', () => { - expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.2); + expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.5); expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9); expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95); }); @@ -23,12 +23,12 @@ describe('success-criteria', () => { describe('validateResults', () => { it('returns passed=true when all criteria met', () => { - // 10 results: 4 clean (40% > 30%), 5 corrected (9/10 = 90% correction), 1 retried (100% retry) + // 10 results: 6 clean (60% > 50%), 3 corrected (9/10 = 90% correction), 1 retried (100% retry) const results: EvalResult[] = [ - ...Array(4) + ...Array(6) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(5) + ...Array(3) .fill(null) .map(() => makeResult(true, 1, 1)), makeResult(true, 2), @@ -38,16 +38,18 @@ describe('success-criteria', () => { expect(validation.passed).toBe(true); expect(validation.failures).toHaveLength(0); - expect(validation.actual.firstAttemptPassRate).toBe(0.4); + expect(validation.actual.firstAttemptPassRate).toBe(0.6); expect(validation.actual.withCorrectionPassRate).toBe(0.9); expect(validation.actual.withRetryPassRate).toBe(1); }); it('returns passed=false when first-attempt rate below threshold', () => { - // 10 results, only 1 passed on first attempt (10% < 20% threshold) + // 10 results: 4 clean (40% < 50%), 5 corrected (90% correction), 1 retried const results: EvalResult[] = [ - makeResult(true, 1, 0), - ...Array(8) + ...Array(4) + .fill(null) + .map(() => makeResult(true, 1, 0)), + 
...Array(5) .fill(null) .map(() => makeResult(true, 1, 1)), makeResult(true, 2), @@ -60,12 +62,12 @@ describe('success-criteria', () => { }); it('returns passed=false when with-retry rate below threshold', () => { - // 10 results: 4 clean, 5 corrected (90% correction), 1 failed → 90% retry < 95% + // 10 results: 6 clean (60%), 3 corrected (90% correction), 1 failed → 90% retry < 95% const results: EvalResult[] = [ - ...Array(4) + ...Array(6) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(5) + ...Array(3) .fill(null) .map(() => makeResult(true, 1, 1)), makeResult(false, 3), @@ -79,14 +81,14 @@ describe('success-criteria', () => { }); it('returns both failures when multiple criteria not met', () => { - // 10 results, 2 passed first attempt (20% < 30%), 4 failed entirely (60% < 95% retry) + // 10 results: 2 clean (20% < 50%), 4 corrected, 4 failed (60% < 95% retry) const results: EvalResult[] = [ ...Array(2) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), ...Array(4) .fill(null) - .map(() => makeResult(true, 2)), + .map(() => makeResult(true, 1, 1)), ...Array(4) .fill(null) .map(() => makeResult(false, 3)), @@ -131,15 +133,15 @@ describe('success-criteria', () => { it('passes when exactly at threshold', () => { // 20 results: - // 6 clean first-attempt (attempt=1, corrections=0) → 30% first-attempt - // 12 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction + // 10 clean first-attempt (attempt=1, corrections=0) → 50% first-attempt + // 8 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction // 1 passed on scenario retry (attempt=2) → 19/20 = 95% with-retry // 1 failed (attempt=3) const results: EvalResult[] = [ - ...Array(6) + ...Array(10) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(12) + ...Array(8) .fill(null) .map(() => makeResult(true, 1, 1)), makeResult(true, 2), diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts index 110ce51..3373397 
100644 --- a/tests/evals/success-criteria.ts +++ b/tests/evals/success-criteria.ts @@ -17,7 +17,7 @@ export interface SuccessCriteria { /** Default thresholds for CI enforcement */ export const DEFAULT_CRITERIA: SuccessCriteria = { - firstAttemptPassRate: 0.2, + firstAttemptPassRate: 0.5, withCorrectionPassRate: 0.9, withRetryPassRate: 0.95, }; From 807116c14d80131e410abc0691270020c24b0b16 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 11:35:31 -0600 Subject: [PATCH 09/14] feat: detect build systems beyond package.json for multi-language support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend quick-checks to auto-detect Go (go.mod), Elixir (mix.exs), .NET (*.csproj), and Kotlin/Java (build.gradle) build commands from project files. Interpreted languages (Python, Ruby, PHP) pass through silently — no universal build command exists for them. --- src/lib/validation/build-validator.spec.ts | 116 +++++++++++++++++++++ src/lib/validation/build-validator.ts | 50 ++++++++- src/lib/validation/quick-checks.spec.ts | 20 ++++ src/lib/validation/quick-checks.ts | 60 +++++++++-- 4 files changed, 238 insertions(+), 8 deletions(-) create mode 100644 src/lib/validation/build-validator.spec.ts diff --git a/src/lib/validation/build-validator.spec.ts b/src/lib/validation/build-validator.spec.ts new file mode 100644 index 0000000..41273b7 --- /dev/null +++ b/src/lib/validation/build-validator.spec.ts @@ -0,0 +1,116 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { detectBuildCommand } from './build-validator.js'; + +describe('detectBuildCommand', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'build-detect-test-')); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + 
it('detects package.json with build script (pnpm)', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } })); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'pnpm', args: ['build'] }); + }); + + it('detects package.json with build script (npm)', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'react-scripts build' } })); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'npm', args: ['run', 'build'] }); + }); + + it('skips package.json without build script', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { start: 'node index.js' } })); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('detects go.mod → go build', async () => { + writeFileSync(join(testDir, 'go.mod'), 'module example.com/app\n\ngo 1.21\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'go', args: ['build', './...'] }); + }); + + it('detects mix.exs → mix compile', async () => { + writeFileSync(join(testDir, 'mix.exs'), 'defmodule MyApp.MixProject do\nend\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'mix', args: ['compile'] }); + }); + + it('detects *.csproj → dotnet build', async () => { + writeFileSync(join(testDir, 'MyApp.csproj'), '\n\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'dotnet', args: ['build'] }); + }); + + it('detects build.gradle.kts with gradlew → ./gradlew build', async () => { + writeFileSync(join(testDir, 'build.gradle.kts'), 'plugins { kotlin("jvm") }\n'); + writeFileSync(join(testDir, 'gradlew'), '#!/bin/sh\nexec gradle "$@"\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: './gradlew', args: 
['build'] }); + }); + + it('detects build.gradle without gradlew → gradle build', async () => { + writeFileSync(join(testDir, 'build.gradle'), 'apply plugin: "java"\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'gradle', args: ['build'] }); + }); + + it('returns null for empty directory', async () => { + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('returns null for Python project (no universal build)', async () => { + writeFileSync(join(testDir, 'pyproject.toml'), '[project]\nname = "myapp"\n'); + writeFileSync(join(testDir, 'app.py'), 'print("hello")\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('returns null for Ruby project (no universal build)', async () => { + writeFileSync(join(testDir, 'Gemfile'), 'source "https://rubygems.org"\ngem "rails"\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('package.json build script takes priority over go.mod', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'tsc' } })); + writeFileSync(join(testDir, 'go.mod'), 'module example.com/app\n'); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'pnpm', args: ['build'] }); + }); +}); diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts index 2e5fc9f..52836c5 100644 --- a/src/lib/validation/build-validator.ts +++ b/src/lib/validation/build-validator.ts @@ -1,5 +1,5 @@ import { spawn } from 'child_process'; -import { existsSync } from 'fs'; +import { existsSync, readdirSync } from 'fs'; import { readFile } from 'fs/promises'; import { join } from 'path'; import type { ValidationIssue } from './types.js'; @@ -115,6 +115,54 @@ export async function hasBuildScriptInPackageJson(projectDir: string): Promise { + // 1. 
package.json with build script (JS/TS frameworks) + const pm = detectPackageManager(projectDir); + if (await hasBuildScriptInPackageJson(projectDir)) { + const args = pm === 'npm' ? ['run', 'build'] : ['build']; + return { command: pm, args }; + } + + // 2. Go (go.mod → go build ./...) + if (existsSync(join(projectDir, 'go.mod'))) { + return { command: 'go', args: ['build', './...'] }; + } + + // 3. Elixir (mix.exs → mix compile) + if (existsSync(join(projectDir, 'mix.exs'))) { + return { command: 'mix', args: ['compile'] }; + } + + // 4. .NET (*.csproj → dotnet build) + try { + const files = readdirSync(projectDir); + if (files.some((f) => f.endsWith('.csproj'))) { + return { command: 'dotnet', args: ['build'] }; + } + } catch { + // Can't read directory — skip + } + + // 5. Kotlin/Java (build.gradle.kts or build.gradle → gradlew/gradle build) + if (existsSync(join(projectDir, 'build.gradle.kts')) || existsSync(join(projectDir, 'build.gradle'))) { + const gradlew = existsSync(join(projectDir, 'gradlew')) ? './gradlew' : 'gradle'; + return { command: gradlew, args: ['build'] }; + } + + // Interpreted languages (Python, Ruby, PHP) have no universal build command. + // Return null — quick-checks will skip the build step silently. 
+ return null; +} + export function parseBuildErrors(output: string): string[] { const errors: string[] = []; diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts index 4b4cf8c..190221f 100644 --- a/src/lib/validation/quick-checks.spec.ts +++ b/src/lib/validation/quick-checks.spec.ts @@ -138,6 +138,26 @@ describe('runQuickChecks', () => { expect(result.results[1].phase).toBe('build'); expect(result.agentRetryPrompt).toContain('build failed'); }); + + it('skips build when no build system detected (e.g., Python project)', async () => { + // Rewrite testDir without a build script or any build system markers + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } }), + ); + + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); // typecheck pass only + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(2); + expect(result.results[0].phase).toBe('typecheck'); + expect(result.results[1].phase).toBe('build'); + expect(result.results[1].passed).toBe(true); // passed through silently + // Only one spawn call (typecheck) — no spawn for build + expect(mockSpawn).toHaveBeenCalledTimes(1); + }); }); describe('runTypecheckValidation', () => { diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts index 128eee7..6db3132 100644 --- a/src/lib/validation/quick-checks.ts +++ b/src/lib/validation/quick-checks.ts @@ -2,7 +2,7 @@ import { spawn } from 'child_process'; import { readFile } from 'fs/promises'; import { join } from 'path'; import type { QuickCheckResult, QuickChecksOutput, ValidationIssue } from './types.js'; -import { detectPackageManager, parseBuildErrors, runBuildValidation } from './build-validator.js'; +import { detectBuildCommand, detectPackageManager, parseBuildErrors } from './build-validator.js'; const DEFAULT_TYPECHECK_TIMEOUT_MS = 30_000; const 
DEFAULT_BUILD_TIMEOUT_MS = 60_000; @@ -111,17 +111,63 @@ export async function runTypecheckValidation( } /** - * Run build as a quick check, wrapping the existing runBuildValidation. + * Run build as a quick check using auto-detected build command. + * Supports JS (package.json), Go (go.mod), Elixir (mix.exs), .NET (*.csproj), Kotlin/Java (build.gradle). + * Returns passed when no build system detected — quick-checks are an optimization, not a requirement. */ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise { - const buildResult = await runBuildValidation(projectDir, timeoutMs); + const startTime = Date.now(); + const buildCmd = await detectBuildCommand(projectDir); + + if (!buildCmd) { + return { + passed: true, + phase: 'build', + issues: [], + agentPrompt: null, + durationMs: Date.now() - startTime, + }; + } + + const { exitCode, stdout, stderr } = await spawnCommand( + buildCmd.command, + buildCmd.args, + projectDir, + timeoutMs, + ); + + if (exitCode === 0) { + return { + passed: true, + phase: 'build', + issues: [], + agentPrompt: null, + durationMs: Date.now() - startTime, + }; + } + + const output = stdout + stderr; + const errors = parseBuildErrors(output); + const issues: ValidationIssue[] = errors.length > 0 + ? errors.map((e) => ({ + type: 'file' as const, + severity: 'error' as const, + message: `Build error: ${e}`, + hint: 'Fix the error and run build again', + })) + : [{ + type: 'file' as const, + severity: 'error' as const, + message: 'Build failed', + hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`, + }]; return { - passed: buildResult.success, + passed: false, phase: 'build', - issues: buildResult.issues, - agentPrompt: buildResult.success ? 
null : formatBuildErrors(buildResult.issues), - durationMs: buildResult.durationMs, + issues, + agentPrompt: formatBuildErrors(issues), + durationMs: Date.now() - startTime, }; } From e5d4abfb7ab60b711ea84528bfa42e9c00626634 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 11:38:21 -0600 Subject: [PATCH 10/14] fix: bump first-attempt threshold to 80% and fix quality grader JSON parsing Raise firstAttemptPassRate from 50% to 80% now that false positives from non-TS projects are eliminated (85.7% observed in latest run). Fix quality grader parsing: the greedy regex matched braces inside tags. Now extracts JSON only after and uses a non-greedy pattern to avoid capturing nested objects. --- tests/evals/graders/quality-grader.ts | 5 ++-- tests/evals/success-criteria.spec.ts | 34 ++++++++++++--------------- tests/evals/success-criteria.ts | 2 +- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/tests/evals/graders/quality-grader.ts b/tests/evals/graders/quality-grader.ts index 91165a4..22d1bf6 100644 --- a/tests/evals/graders/quality-grader.ts +++ b/tests/evals/graders/quality-grader.ts @@ -88,8 +88,9 @@ Then, output your final scores as JSON. const thinkingMatch = text.match(/([\s\S]*?)<\/thinking>/); const reasoning = thinkingMatch?.[1]?.trim() || 'No reasoning provided'; - // Extract JSON scores (after thinking block) - const jsonMatch = text.match(/\{[\s\S]*\}/); + // Extract JSON scores — look after tag to avoid matching braces in reasoning + const afterThinking = thinkingMatch ? 
text.slice(text.indexOf('') + ''.length) : text; + const jsonMatch = afterThinking.match(/\{[^{}]*\}/); if (!jsonMatch) return null; const parsed = JSON.parse(jsonMatch[0]) as Record; diff --git a/tests/evals/success-criteria.spec.ts b/tests/evals/success-criteria.spec.ts index ce746bf..c0e5667 100644 --- a/tests/evals/success-criteria.spec.ts +++ b/tests/evals/success-criteria.spec.ts @@ -15,7 +15,7 @@ function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: n describe('success-criteria', () => { describe('DEFAULT_CRITERIA', () => { it('has expected default thresholds', () => { - expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.5); + expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.8); expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9); expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95); }); @@ -23,14 +23,12 @@ describe('success-criteria', () => { describe('validateResults', () => { it('returns passed=true when all criteria met', () => { - // 10 results: 6 clean (60% > 50%), 3 corrected (9/10 = 90% correction), 1 retried (100% retry) + // 10 results: 8 clean (80%), 1 corrected (9/10 = 90% correction), 1 retried (100% retry) const results: EvalResult[] = [ - ...Array(6) + ...Array(8) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(3) - .fill(null) - .map(() => makeResult(true, 1, 1)), + makeResult(true, 1, 1), makeResult(true, 2), ]; @@ -38,18 +36,18 @@ describe('success-criteria', () => { expect(validation.passed).toBe(true); expect(validation.failures).toHaveLength(0); - expect(validation.actual.firstAttemptPassRate).toBe(0.6); + expect(validation.actual.firstAttemptPassRate).toBe(0.8); expect(validation.actual.withCorrectionPassRate).toBe(0.9); expect(validation.actual.withRetryPassRate).toBe(1); }); it('returns passed=false when first-attempt rate below threshold', () => { - // 10 results: 4 clean (40% < 50%), 5 corrected (90% correction), 1 retried + // 10 results: 7 clean (70% < 80%), 2 corrected (90% 
correction), 1 retried const results: EvalResult[] = [ - ...Array(4) + ...Array(7) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(5) + ...Array(2) .fill(null) .map(() => makeResult(true, 1, 1)), makeResult(true, 2), @@ -62,14 +60,12 @@ describe('success-criteria', () => { }); it('returns passed=false when with-retry rate below threshold', () => { - // 10 results: 6 clean (60%), 3 corrected (90% correction), 1 failed → 90% retry < 95% + // 10 results: 8 clean (80%), 1 corrected (90% correction), 1 failed → 90% retry < 95% const results: EvalResult[] = [ - ...Array(6) + ...Array(8) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(3) - .fill(null) - .map(() => makeResult(true, 1, 1)), + makeResult(true, 1, 1), makeResult(false, 3), ]; @@ -133,15 +129,15 @@ describe('success-criteria', () => { it('passes when exactly at threshold', () => { // 20 results: - // 10 clean first-attempt (attempt=1, corrections=0) → 50% first-attempt - // 8 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction + // 16 clean first-attempt (attempt=1, corrections=0) → 80% first-attempt + // 2 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction // 1 passed on scenario retry (attempt=2) → 19/20 = 95% with-retry // 1 failed (attempt=3) const results: EvalResult[] = [ - ...Array(10) + ...Array(16) .fill(null) .map(() => makeResult(true, 1, 0)), - ...Array(8) + ...Array(2) .fill(null) .map(() => makeResult(true, 1, 1)), makeResult(true, 2), diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts index 3373397..a2c2f63 100644 --- a/tests/evals/success-criteria.ts +++ b/tests/evals/success-criteria.ts @@ -17,7 +17,7 @@ export interface SuccessCriteria { /** Default thresholds for CI enforcement */ export const DEFAULT_CRITERIA: SuccessCriteria = { - firstAttemptPassRate: 0.5, + firstAttemptPassRate: 0.8, withCorrectionPassRate: 0.9, withRetryPassRate: 0.95, }; From b21edf7925830d31a18d80b4dcab3257b486665c Mon Sep 17 
00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 11:59:43 -0600 Subject: [PATCH 11/14] chore: formatting --- src/lib/agent-interface.spec.ts | 106 ++++++------------- src/lib/agent-interface.ts | 6 +- src/lib/agent-runner.ts | 11 +- src/lib/validation/quick-checks.spec.ts | 60 +++-------- src/lib/validation/quick-checks.ts | 41 ++++--- src/lib/validation/validator.ts | 13 +-- tests/evals/__tests__/agent-executor.spec.ts | 6 +- tests/evals/reporter.ts | 4 +- tests/evals/success-criteria.ts | 6 +- 9 files changed, 95 insertions(+), 158 deletions(-) diff --git a/src/lib/agent-interface.spec.ts b/src/lib/agent-interface.spec.ts index d627276..b266aec 100644 --- a/src/lib/agent-interface.spec.ts +++ b/src/lib/agent-interface.spec.ts @@ -10,7 +10,11 @@ const { mockQuery, mockConfig } = vi.hoisted(() => ({ proxy: { refreshThresholdMs: 300000 }, nodeVersion: '20', logging: { debugMode: false }, - documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' }, + documentation: { + workosDocsUrl: 'https://workos.com/docs', + dashboardUrl: 'https://dashboard.workos.com', + issuesUrl: 'https://github.com', + }, frameworks: {}, legacy: { oauthPort: 3000 }, branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false }, @@ -149,37 +153,23 @@ describe('runAgent retry loop', () => { }); it('returns retryCount=0 when no retryConfig provided', async () => { - mockQuery.mockImplementation( - createMockSDKResponse([{ text: 'Done!' }]), - ); + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' 
}])); - const result = await runAgent( - makeAgentConfig(), - 'Test prompt', - makeOptions(), - undefined, - emitter, - ); + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter); expect(result.error).toBeUndefined(); expect(result.retryCount).toBe(0); }); it('returns retryCount=0 when validation passes first try', async () => { - mockQuery.mockImplementation( - createMockSDKResponse([{ text: 'Done!' }]), - ); + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }])); const validateAndFormat = vi.fn().mockResolvedValue(null); // passes - const result = await runAgent( - makeAgentConfig(), - 'Test prompt', - makeOptions(), - undefined, - emitter, - { maxRetries: 2, validateAndFormat }, - ); + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); expect(result.error).toBeUndefined(); expect(result.retryCount).toBe(0); @@ -199,25 +189,17 @@ describe('runAgent retry loop', () => { it('retries once when validation fails then passes', async () => { // Two turns: initial + one retry - mockQuery.mockImplementation( - createMockSDKResponse([ - { text: 'Initial attempt' }, - { text: 'Fixed it!' }, - ]), - ); + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Initial attempt' }, { text: 'Fixed it!' 
}])); - const validateAndFormat = vi.fn() + const validateAndFormat = vi + .fn() .mockResolvedValueOnce('Type error in src/foo.ts') // fail first .mockResolvedValueOnce(null); // pass second - const result = await runAgent( - makeAgentConfig(), - 'Test prompt', - makeOptions(), - undefined, - emitter, - { maxRetries: 2, validateAndFormat }, - ); + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); expect(result.error).toBeUndefined(); expect(result.retryCount).toBe(1); @@ -232,23 +214,15 @@ describe('runAgent retry loop', () => { it('caps at maxRetries when validation always fails', async () => { // Three turns: initial + 2 retries mockQuery.mockImplementation( - createMockSDKResponse([ - { text: 'Attempt 1' }, - { text: 'Attempt 2' }, - { text: 'Attempt 3' }, - ]), + createMockSDKResponse([{ text: 'Attempt 1' }, { text: 'Attempt 2' }, { text: 'Attempt 3' }]), ); const validateAndFormat = vi.fn().mockResolvedValue('Still broken'); - const result = await runAgent( - makeAgentConfig(), - 'Test prompt', - makeOptions(), - undefined, - emitter, - { maxRetries: 2, validateAndFormat }, - ); + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); expect(result.error).toBeUndefined(); expect(result.retryCount).toBe(2); @@ -261,20 +235,14 @@ describe('runAgent retry loop', () => { }); it('preserves existing behavior with maxRetries=0', async () => { - mockQuery.mockImplementation( - createMockSDKResponse([{ text: 'Done!' }]), - ); + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' 
}])); const validateAndFormat = vi.fn().mockResolvedValue('Error'); - const result = await runAgent( - makeAgentConfig(), - 'Test prompt', - makeOptions(), - undefined, - emitter, - { maxRetries: 0, validateAndFormat }, - ); + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 0, + validateAndFormat, + }); expect(result.error).toBeUndefined(); expect(result.retryCount).toBe(0); @@ -283,20 +251,14 @@ describe('runAgent retry loop', () => { }); it('treats validateAndFormat errors as passed', async () => { - mockQuery.mockImplementation( - createMockSDKResponse([{ text: 'Done!' }]), - ); + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }])); const validateAndFormat = vi.fn().mockRejectedValue(new Error('Validation crashed')); - const result = await runAgent( - makeAgentConfig(), - 'Test prompt', - makeOptions(), - undefined, - emitter, - { maxRetries: 2, validateAndFormat }, - ); + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); expect(result.error).toBeUndefined(); expect(result.retryCount).toBe(0); diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 5b0018c..65b168f 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -634,7 +634,11 @@ export async function runAgent( resolveCurrentTurn(); } // Let callers observe messages (e.g., for latency tracking in evals) - try { onMessage?.(message); } catch { /* observer errors are non-critical */ } + try { + onMessage?.(message); + } catch { + /* observer errors are non-critical */ + } } const durationMs = Date.now() - startTime; diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index b2eef7d..a84aef8 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -120,11 +120,12 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal }; // Build retry 
config - const retryConfig: RetryConfig | undefined = - options.noValidate ? undefined : { - maxRetries: options.maxRetries ?? 2, - validateAndFormat, - }; + const retryConfig: RetryConfig | undefined = options.noValidate + ? undefined + : { + maxRetries: options.maxRetries ?? 2, + validateAndFormat, + }; // Run agent with retry support — agent gets correction prompts on validation failure const agentResult = await runAgent( diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts index 190221f..a36dd1e 100644 --- a/src/lib/validation/quick-checks.spec.ts +++ b/src/lib/validation/quick-checks.spec.ts @@ -52,9 +52,7 @@ describe('runQuickChecks', () => { }); it('returns passed=true when both typecheck and build succeed', async () => { - mockSpawn - .mockImplementationOnce(() => createMockProcess(0)) - .mockImplementationOnce(() => createMockProcess(0)); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0)); const result = await runQuickChecks(testDir); @@ -80,9 +78,7 @@ describe('runQuickChecks', () => { }); it('runs build after typecheck passes', async () => { - mockSpawn - .mockImplementationOnce(() => createMockProcess(0)) - .mockImplementationOnce(() => createMockProcess(0)); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0)); const result = await runQuickChecks(testDir); @@ -103,7 +99,8 @@ describe('runQuickChecks', () => { }); it('generates agentRetryPrompt when typecheck fails', async () => { - const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + const tsError = + "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); const result = await runQuickChecks(testDir); @@ -114,9 +111,7 @@ 
describe('runQuickChecks', () => { }); it('tracks total duration', async () => { - mockSpawn - .mockImplementationOnce(() => createMockProcess(0)) - .mockImplementationOnce(() => createMockProcess(0)); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0)); const result = await runQuickChecks(testDir); @@ -141,10 +136,7 @@ describe('runQuickChecks', () => { it('skips build when no build system detected (e.g., Python project)', async () => { // Rewrite testDir without a build script or any build system markers - writeFileSync( - join(testDir, 'package.json'), - JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } }), - ); + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } })); mockSpawn.mockImplementationOnce(() => createMockProcess(0)); // typecheck pass only @@ -216,8 +208,7 @@ describe('runTypecheckValidation', () => { }); it('handles pretty-printed tsc errors (colon-separated format)', async () => { - const tsError = - "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'."; + const tsError = "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'."; mockSpawn.mockImplementationOnce(() => createMockProcess(1, tsError, '')); const result = await runTypecheckValidation(testDir); @@ -227,9 +218,7 @@ describe('runTypecheckValidation', () => { }); it('provides fallback message when errors cannot be parsed', async () => { - mockSpawn.mockImplementationOnce(() => - createMockProcess(1, '', 'Some unknown error format that we cannot parse'), - ); + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', 'Some unknown error format that we cannot parse')); const result = await runTypecheckValidation(testDir); @@ -243,35 +232,21 @@ describe('runTypecheckValidation', () => { await runTypecheckValidation(testDir); - expect(mockSpawn).toHaveBeenCalledWith( - 'pnpm', - ['typecheck'], - 
expect.objectContaining({ cwd: testDir }), - ); + expect(mockSpawn).toHaveBeenCalledWith('pnpm', ['typecheck'], expect.objectContaining({ cwd: testDir })); }); it('falls back to npx tsc --noEmit when no typecheck script but tsconfig exists', async () => { - writeFileSync( - join(testDir, 'package.json'), - JSON.stringify({ scripts: { build: 'next build' } }), - ); + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } })); writeFileSync(join(testDir, 'tsconfig.json'), '{}'); mockSpawn.mockImplementationOnce(() => createMockProcess(0)); await runTypecheckValidation(testDir); - expect(mockSpawn).toHaveBeenCalledWith( - 'npx', - ['tsc', '--noEmit'], - expect.objectContaining({ cwd: testDir }), - ); + expect(mockSpawn).toHaveBeenCalledWith('npx', ['tsc', '--noEmit'], expect.objectContaining({ cwd: testDir })); }); it('skips typecheck when no tsconfig.json and no typecheck script', async () => { - writeFileSync( - join(testDir, 'package.json'), - JSON.stringify({ scripts: { build: 'go build' } }), - ); + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'go build' } })); // No tsconfig.json — not a TypeScript project const result = await runTypecheckValidation(testDir); @@ -282,19 +257,12 @@ describe('runTypecheckValidation', () => { }); it('detects type-check script (hyphenated variant)', async () => { - writeFileSync( - join(testDir, 'package.json'), - JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } }), - ); + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } })); mockSpawn.mockImplementationOnce(() => createMockProcess(0)); await runTypecheckValidation(testDir); - expect(mockSpawn).toHaveBeenCalledWith( - 'pnpm', - ['type-check'], - expect.objectContaining({ cwd: testDir }), - ); + expect(mockSpawn).toHaveBeenCalledWith('pnpm', ['type-check'], expect.objectContaining({ cwd: testDir })); }); it('tracks duration', async () => { diff 
--git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts index 6db3132..3612209 100644 --- a/src/lib/validation/quick-checks.ts +++ b/src/lib/validation/quick-checks.ts @@ -19,10 +19,7 @@ export async function runQuickChecks( const results: QuickCheckResult[] = []; // Step 1: Typecheck - const typecheckResult = await runTypecheckValidation( - projectDir, - options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS, - ); + const typecheckResult = await runTypecheckValidation(projectDir, options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS); results.push(typecheckResult); // Step 2: Build — only if typecheck passed and build not skipped @@ -129,12 +126,7 @@ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promis }; } - const { exitCode, stdout, stderr } = await spawnCommand( - buildCmd.command, - buildCmd.args, - projectDir, - timeoutMs, - ); + const { exitCode, stdout, stderr } = await spawnCommand(buildCmd.command, buildCmd.args, projectDir, timeoutMs); if (exitCode === 0) { return { @@ -148,19 +140,22 @@ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promis const output = stdout + stderr; const errors = parseBuildErrors(output); - const issues: ValidationIssue[] = errors.length > 0 - ? errors.map((e) => ({ - type: 'file' as const, - severity: 'error' as const, - message: `Build error: ${e}`, - hint: 'Fix the error and run build again', - })) - : [{ - type: 'file' as const, - severity: 'error' as const, - message: 'Build failed', - hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`, - }]; + const issues: ValidationIssue[] = + errors.length > 0 + ? 
errors.map((e) => ({ + type: 'file' as const, + severity: 'error' as const, + message: `Build error: ${e}`, + hint: 'Fix the error and run build again', + })) + : [ + { + type: 'file' as const, + severity: 'error' as const, + message: 'Build failed', + hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`, + }, + ]; return { passed: false, diff --git a/src/lib/validation/validator.ts b/src/lib/validation/validator.ts index b35a2fa..6acaa43 100644 --- a/src/lib/validation/validator.ts +++ b/src/lib/validation/validator.ts @@ -30,12 +30,12 @@ export async function validateInstallation( } // Run validations - issues.push(...await validatePackages(rules, projectDir)); - issues.push(...await validateEnvVars(rules, projectDir)); - issues.push(...await validateFiles(rules, projectDir)); + issues.push(...(await validatePackages(rules, projectDir))); + issues.push(...(await validateEnvVars(rules, projectDir))); + issues.push(...(await validateFiles(rules, projectDir))); // Run framework-specific cross-validations - issues.push(...await validateFrameworkSpecific(framework, projectDir)); + issues.push(...(await validateFrameworkSpecific(framework, projectDir))); // Run build validation if enabled if (options.runBuild !== false) { @@ -220,10 +220,7 @@ export async function validateFiles(rules: ValidationRules, projectDir: string): /** * Framework-specific cross-validations that require reading multiple sources. 
*/ -export async function validateFrameworkSpecific( - framework: string, - projectDir: string, -): Promise { +export async function validateFrameworkSpecific(framework: string, projectDir: string): Promise { const issues: ValidationIssue[] = []; // Universal cross-validations diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts index 2ca23bc..2316a18 100644 --- a/tests/evals/__tests__/agent-executor.spec.ts +++ b/tests/evals/__tests__/agent-executor.spec.ts @@ -13,7 +13,11 @@ const { mockRunAgent, mockConfig, mockCredentials } = vi.hoisted(() => ({ proxy: { refreshThresholdMs: 300000 }, nodeVersion: '20', logging: { debugMode: false }, - documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' }, + documentation: { + workosDocsUrl: 'https://workos.com/docs', + dashboardUrl: 'https://dashboard.workos.com', + issuesUrl: 'https://github.com', + }, frameworks: {}, legacy: { oauthPort: 3000 }, branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false }, diff --git a/tests/evals/reporter.ts b/tests/evals/reporter.ts index 297641e..316dd7e 100644 --- a/tests/evals/reporter.ts +++ b/tests/evals/reporter.ts @@ -60,7 +60,9 @@ export function printMatrix(results: EvalResult[]): void { const total = results.length; const rate = ((passed / total) * 100).toFixed(1); const selfCorrected = results.filter((r) => r.selfCorrected).length; - console.log(`\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? `, ${selfCorrected} self-corrected` : ''}`); + console.log( + `\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? 
`, ${selfCorrected} self-corrected` : ''}`, + ); if (passed < total) { console.log('\nFailed scenarios:'); diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts index a2c2f63..6805ed0 100644 --- a/tests/evals/success-criteria.ts +++ b/tests/evals/success-criteria.ts @@ -70,7 +70,11 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria return { passed: failures.length === 0, criteria, - actual: { firstAttemptPassRate: firstAttemptRate, withCorrectionPassRate: withCorrectionRate, withRetryPassRate: withRetryRate }, + actual: { + firstAttemptPassRate: firstAttemptRate, + withCorrectionPassRate: withCorrectionRate, + withRetryPassRate: withRetryRate, + }, failures, }; } From 719fd6b8669d3eec5ab6d8c768bfd94ea3c067d4 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 12:06:52 -0600 Subject: [PATCH 12/14] chore: remove comment slop and dead validation:quick events --- src/lib/agent-interface.ts | 16 ++-------------- src/lib/events.ts | 7 ------- src/lib/validation/build-validator.ts | 9 +-------- src/lib/validation/quick-checks.ts | 5 ----- 4 files changed, 3 insertions(+), 34 deletions(-) diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 65b168f..6b1b3b5 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -536,7 +536,6 @@ export async function runAgent( resetTurnSignal(); const createPromptStream = async function* () { - // Initial prompt yield { type: 'user', session_id: '', @@ -544,13 +543,10 @@ export async function runAgent( parent_tool_use_id: null, }; - // Retry loop — yield follow-up correction prompts on validation failure if (retryConfig && maxRetries > 0) { while (retryCount < maxRetries) { - // Wait for agent to finish current turn await currentTurnDone; - // Run validation between turns emitter?.emit('validation:retry:start', { attempt: retryCount + 1 }); let validationPrompt: string | null; @@ -567,14 +563,13 @@ export async function runAgent( 
passed: validationPrompt === null, }); - if (validationPrompt === null) break; // Validation passed + if (validationPrompt === null) break; retryCount++; emitter?.emit('agent:retry', { attempt: retryCount, maxRetries }); resetTurnSignal(); - // Feed errors back to agent in same conversation yield { type: 'user', session_id: '', @@ -584,7 +579,6 @@ export async function runAgent( } } - // Keep generator alive until the final result is received await currentTurnDone; }; @@ -629,16 +623,10 @@ export async function runAgent( if (messageError) { sdkError = messageError; } - // Signal turn completion when result received — this resumes the generator if (message.type === 'result') { resolveCurrentTurn(); } - // Let callers observe messages (e.g., for latency tracking in evals) - try { - onMessage?.(message); - } catch { - /* observer errors are non-critical */ - } + try { onMessage?.(message); } catch { /* non-critical */ } } const durationMs = Date.now() - startTime; diff --git a/src/lib/events.ts b/src/lib/events.ts index e0a2279..027bd31 100644 --- a/src/lib/events.ts +++ b/src/lib/events.ts @@ -57,13 +57,6 @@ export interface InstallerEvents { 'validation:retry:start': { attempt: number }; 'validation:retry:complete': { attempt: number; passed: boolean }; - 'validation:quick:start': Record; - 'validation:quick:complete': { - passed: boolean; - results: import('./validation/types.js').QuickCheckResult[]; - durationMs: number; - }; - 'validation:start': { framework: string }; 'validation:issues': { issues: import('./validation/types.js').ValidationIssue[] }; 'validation:complete': { passed: boolean; issueCount: number; durationMs: number }; diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts index 52836c5..854f96d 100644 --- a/src/lib/validation/build-validator.ts +++ b/src/lib/validation/build-validator.ts @@ -125,41 +125,34 @@ export interface BuildCommand { * Returns null if no build system detected — caller should skip build 
validation. */ export async function detectBuildCommand(projectDir: string): Promise { - // 1. package.json with build script (JS/TS frameworks) const pm = detectPackageManager(projectDir); if (await hasBuildScriptInPackageJson(projectDir)) { const args = pm === 'npm' ? ['run', 'build'] : ['build']; return { command: pm, args }; } - // 2. Go (go.mod → go build ./...) if (existsSync(join(projectDir, 'go.mod'))) { return { command: 'go', args: ['build', './...'] }; } - // 3. Elixir (mix.exs → mix compile) if (existsSync(join(projectDir, 'mix.exs'))) { return { command: 'mix', args: ['compile'] }; } - // 4. .NET (*.csproj → dotnet build) try { const files = readdirSync(projectDir); if (files.some((f) => f.endsWith('.csproj'))) { return { command: 'dotnet', args: ['build'] }; } } catch { - // Can't read directory — skip + // Can't read directory } - // 5. Kotlin/Java (build.gradle.kts or build.gradle → gradlew/gradle build) if (existsSync(join(projectDir, 'build.gradle.kts')) || existsSync(join(projectDir, 'build.gradle'))) { const gradlew = existsSync(join(projectDir, 'gradlew')) ? './gradlew' : 'gradle'; return { command: gradlew, args: ['build'] }; } - // Interpreted languages (Python, Ruby, PHP) have no universal build command. - // Return null — quick-checks will skip the build step silently. return null; } diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts index 3612209..7ceab1b 100644 --- a/src/lib/validation/quick-checks.ts +++ b/src/lib/validation/quick-checks.ts @@ -109,8 +109,6 @@ export async function runTypecheckValidation( /** * Run build as a quick check using auto-detected build command. - * Supports JS (package.json), Go (go.mod), Elixir (mix.exs), .NET (*.csproj), Kotlin/Java (build.gradle). - * Returns passed when no build system detected — quick-checks are an optimization, not a requirement. 
*/ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise { const startTime = Date.now(); @@ -173,7 +171,6 @@ interface TypecheckCommand { /** * Detect the appropriate typecheck command for the project. - * Checks for tsc in node_modules, then framework-specific alternatives. */ async function detectTypecheckCommand(projectDir: string): Promise { const pm = detectPackageManager(projectDir); @@ -234,7 +231,6 @@ function parseTypecheckErrors(output: string): string[] { /** * Format typecheck errors into an agent-ready prompt. - * Turns "TS2345: Argument of type..." into actionable instructions. */ function formatTypecheckErrors(errors: string[], rawOutput: string): string { if (errors.length === 0) { @@ -267,7 +263,6 @@ function formatBuildErrors(issues: ValidationIssue[]): string { /** * Format quick check failures into an agent-ready prompt. - * Combines typecheck and build errors into a single actionable prompt. */ function formatForAgent(results: QuickCheckResult[]): string { const failedResults = results.filter((r) => !r.passed); From 7cd7147b8d095760fed5fda79e1cb805ab7836de Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 15:31:57 -0600 Subject: [PATCH 13/14] refactor: simplify quick-checks, extract shared validateAndFormat, remove dead code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract passResult helper (4 identical object literals → 1 function), unify parseTypecheckErrors into single regex with Set dedup, extract quickCheckValidateAndFormat shared between agent-runner and eval executor, remove getIntegration indirection and dead continueUrl param. 
--- src/lib/agent-interface.ts | 5 +- src/lib/agent-runner.ts | 49 ++----- src/lib/validation/index.ts | 2 +- src/lib/validation/quick-checks.ts | 142 +++++-------------- tests/evals/__tests__/agent-executor.spec.ts | 2 +- tests/evals/agent-executor.ts | 52 ++----- tests/evals/parallel-runner.ts | 6 +- 7 files changed, 61 insertions(+), 197 deletions(-) diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 6b1b3b5..dbe0c99 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -519,12 +519,11 @@ export async function runAgent( const collectedText: string[] = []; try { - // Retry loop coordination let retryCount = 0; const maxRetries = retryConfig?.maxRetries ?? 0; - // Turn completion signals — the response loop resolves currentTurnDone - // when a 'result' message arrives. The generator awaits it between turns. + // Turn completion signals — resolveCurrentTurn is called when a 'result' + // message arrives; the prompt generator awaits currentTurnDone between turns. let resolveCurrentTurn!: () => void; let currentTurnDone!: Promise; diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index a84aef8..b554d73 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -1,5 +1,5 @@ import { SPINNER_MESSAGE, type FrameworkConfig } from './framework-config.js'; -import { validateInstallation, runQuickChecks } from './validation/index.js'; +import { validateInstallation, quickCheckValidateAndFormat } from './validation/index.js'; import type { InstallerOptions } from '../utils/types.js'; import { ensurePackageIsInstalled, @@ -113,18 +113,11 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal options, ); - // Build validation callback for retry loop — uses quick checks from Phase 1 - const validateAndFormat = async (workingDirectory: string): Promise => { - const quickResult = await runQuickChecks(workingDirectory); - return quickResult.passed ? 
null : quickResult.agentRetryPrompt; - }; - - // Build retry config const retryConfig: RetryConfig | undefined = options.noValidate ? undefined : { maxRetries: options.maxRetries ?? 2, - validateAndFormat, + validateAndFormat: quickCheckValidateAndFormat, }; // Run agent with retry support — agent gets correction prompts on validation failure @@ -190,12 +183,6 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal }); } - // Skip MCP server setup for now (WorkOS doesn't need it initially) - // await addMCPServerToClientsStep({ ... }); - - // Build outro message - const continueUrl = undefined; // No signup flow for WorkOS wizard - const changes = [ ...config.ui.getOutroChanges(frameworkContext), Object.keys(envVars).length > 0 ? `Added environment variables to .env file` : '', @@ -209,8 +196,7 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal : '', ].filter(Boolean); - // Build detailed summary to return to caller (state machine) - const summary = buildCompletionSummary(config, changes, nextSteps, continueUrl); + const summary = buildCompletionSummary(config, changes, nextSteps); await analytics.shutdown('success'); @@ -277,41 +263,24 @@ Report your progress using [STATUS] prefixes. Begin by invoking the ${skillName} skill.`; } -/** - * Build a completion summary for the event payload. - * This is a plain-text summary without styling (adapters handle presentation). 
- */ -function buildCompletionSummary( - config: FrameworkConfig, - changes: string[], - nextSteps: string[], - continueUrl: string | undefined, -): string { - const lines: string[] = []; - - lines.push('Successfully installed WorkOS AuthKit!'); - lines.push(''); +function buildCompletionSummary(config: FrameworkConfig, changes: string[], nextSteps: string[]): string { + const lines: string[] = ['Successfully installed WorkOS AuthKit!', '']; if (changes.length > 0) { lines.push('What the agent did:'); - changes.forEach((change) => lines.push(`• ${change}`)); + for (const change of changes) lines.push(`• ${change}`); lines.push(''); } if (nextSteps.length > 0) { lines.push('Next steps:'); - nextSteps.forEach((step) => lines.push(`• ${step}`)); + for (const step of nextSteps) lines.push(`• ${step}`); lines.push(''); } - lines.push(`Learn more: ${config.metadata.docsUrl}`); - - if (continueUrl) { - lines.push(`Continue onboarding: ${continueUrl}`); - } - - lines.push(''); lines.push( + `Learn more: ${config.metadata.docsUrl}`, + '', 'Note: This installer uses an LLM agent to analyze and modify your project. 
Please review the changes made.', ); diff --git a/src/lib/validation/index.ts b/src/lib/validation/index.ts index c450b97..26c6316 100644 --- a/src/lib/validation/index.ts +++ b/src/lib/validation/index.ts @@ -7,7 +7,7 @@ export { type ValidateOptions, } from './validator.js'; export { runBuildValidation, type BuildResult } from './build-validator.js'; -export { runQuickChecks, runTypecheckValidation } from './quick-checks.js'; +export { runQuickChecks, runTypecheckValidation, quickCheckValidateAndFormat } from './quick-checks.js'; export type { ValidationResult, ValidationRules, diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts index 7ceab1b..2a3ec92 100644 --- a/src/lib/validation/quick-checks.ts +++ b/src/lib/validation/quick-checks.ts @@ -18,14 +18,11 @@ export async function runQuickChecks( const startTime = Date.now(); const results: QuickCheckResult[] = []; - // Step 1: Typecheck const typecheckResult = await runTypecheckValidation(projectDir, options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS); results.push(typecheckResult); - // Step 2: Build — only if typecheck passed and build not skipped if (typecheckResult.passed && !options?.skipBuild) { - const buildResult = await runBuildQuickCheck(projectDir, options?.timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS); - results.push(buildResult); + results.push(await runBuildQuickCheck(projectDir, options?.timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS)); } const passed = results.every((r) => r.passed); @@ -38,6 +35,10 @@ export async function runQuickChecks( }; } +function passResult(phase: QuickCheckResult['phase'], startTime: number): QuickCheckResult { + return { passed: true, phase, issues: [], agentPrompt: null, durationMs: Date.now() - startTime }; +} + /** * Run typecheck only (tsc --noEmit or framework equivalent). * Faster than full build — catches type errors in ~5s. 
@@ -50,14 +51,7 @@ export async function runTypecheckValidation( const typecheckCmd = await detectTypecheckCommand(projectDir); if (!typecheckCmd) { - // No typecheck available — pass through - return { - passed: true, - phase: 'typecheck', - issues: [], - agentPrompt: null, - durationMs: Date.now() - startTime, - }; + return passResult('typecheck', startTime); } const { exitCode, stdout, stderr } = await spawnCommand( @@ -68,25 +62,18 @@ export async function runTypecheckValidation( ); if (exitCode === 0) { - return { - passed: true, - phase: 'typecheck', - issues: [], - agentPrompt: null, - durationMs: Date.now() - startTime, - }; + return passResult('typecheck', startTime); } const output = stdout + stderr; const errors = parseTypecheckErrors(output); const issues: ValidationIssue[] = errors.map((error) => ({ - type: 'file' as const, - severity: 'error' as const, + type: 'file', + severity: 'error', message: `Type error: ${error}`, hint: 'Fix the type error and run typecheck again', })); - // Fallback if no specific errors parsed if (issues.length === 0) { issues.push({ type: 'file', @@ -96,44 +83,27 @@ export async function runTypecheckValidation( }); } - const agentPrompt = formatTypecheckErrors(errors, output); - return { passed: false, phase: 'typecheck', issues, - agentPrompt, + agentPrompt: formatTypecheckErrors(errors, output), durationMs: Date.now() - startTime, }; } -/** - * Run build as a quick check using auto-detected build command. 
- */ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise { const startTime = Date.now(); const buildCmd = await detectBuildCommand(projectDir); if (!buildCmd) { - return { - passed: true, - phase: 'build', - issues: [], - agentPrompt: null, - durationMs: Date.now() - startTime, - }; + return passResult('build', startTime); } const { exitCode, stdout, stderr } = await spawnCommand(buildCmd.command, buildCmd.args, projectDir, timeoutMs); if (exitCode === 0) { - return { - passed: true, - phase: 'build', - issues: [], - agentPrompt: null, - durationMs: Date.now() - startTime, - }; + return passResult('build', startTime); } const output = stdout + stderr; @@ -141,15 +111,15 @@ async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promis const issues: ValidationIssue[] = errors.length > 0 ? errors.map((e) => ({ - type: 'file' as const, - severity: 'error' as const, + type: 'file', + severity: 'error', message: `Build error: ${e}`, hint: 'Fix the error and run build again', })) : [ { - type: 'file' as const, - severity: 'error' as const, + type: 'file', + severity: 'error', message: 'Build failed', hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`, }, @@ -169,69 +139,39 @@ interface TypecheckCommand { args: string[]; } -/** - * Detect the appropriate typecheck command for the project. - */ async function detectTypecheckCommand(projectDir: string): Promise { const pm = detectPackageManager(projectDir); - // Check for typecheck script in package.json first try { const content = await readFile(join(projectDir, 'package.json'), 'utf-8'); const pkg = JSON.parse(content) as { scripts?: Record }; - if (pkg.scripts?.typecheck) { - const args = pm === 'npm' ? ['run', 'typecheck'] : ['typecheck']; - return { command: pm, args }; - } - - if (pkg.scripts?.['type-check']) { - const args = pm === 'npm' ? ['run', 'type-check'] : ['type-check']; + const scriptName = pkg.scripts?.typecheck ? 
'typecheck' : pkg.scripts?.['type-check'] ? 'type-check' : null; + if (scriptName) { + const args = pm === 'npm' ? ['run', scriptName] : [scriptName]; return { command: pm, args }; } } catch { - // No package.json or malformed — continue detection + // No package.json or malformed } - // Only fall back to tsc if the project actually uses TypeScript try { await readFile(join(projectDir, 'tsconfig.json'), 'utf-8'); return { command: 'npx', args: ['tsc', '--noEmit'] }; } catch { - // No tsconfig.json — not a TypeScript project, skip typecheck return null; } } -/** - * Parse TypeScript-specific errors from typecheck output. - */ function parseTypecheckErrors(output: string): string[] { - const errors: string[] = []; - - // TypeScript errors: "src/file.ts(line,col): error TS2345: ..." - const tsErrors = output.match(/[\w./]+\.\w+\(\d+,\d+\):\s*error\s+TS\d+:.+/g); - if (tsErrors) { - errors.push(...tsErrors.slice(0, 10)); - } - - // Also match "src/file.ts:line:col - error TS2345: ..." (tsc --pretty format) - const prettyErrors = output.match(/[\w./]+\.\w+:\d+:\d+\s*-\s*error\s+TS\d+:.+/g); - if (prettyErrors) { - // Dedupe with existing errors - for (const err of prettyErrors.slice(0, 10)) { - if (!errors.some((e) => e.includes(err.split(':')[0]))) { - errors.push(err); - } - } - } - - return errors.slice(0, 10); + // Match both TS error formats: + // src/file.ts(line,col): error TS2345: ... + // src/file.ts:line:col - error TS2345: ... (tsc --pretty) + const pattern = /[\w./]+\.\w+(?:\(\d+,\d+\):\s*|:\d+:\d+\s*-\s*)error\s+TS\d+:.+/g; + const matches = output.match(pattern); + return matches ? [...new Set(matches)].slice(0, 10) : []; } -/** - * Format typecheck errors into an agent-ready prompt. 
- */ function formatTypecheckErrors(errors: string[], rawOutput: string): string { if (errors.length === 0) { // Couldn't parse specific errors — give raw output @@ -253,35 +193,27 @@ function formatTypecheckErrors(errors: string[], rawOutput: string): string { return `The typecheck failed with ${errors.length} error${errors.length === 1 ? '' : 's'}:\n\n${lines.join('\n')}\n\nFix these type errors in the indicated files.`; } -/** - * Format build errors into an agent-ready prompt. - */ function formatBuildErrors(issues: ValidationIssue[]): string { const errorMessages = issues.map((i) => `- ${i.message}`); return `The build failed:\n\n${errorMessages.join('\n')}\n\nFix these build errors.`; } -/** - * Format quick check failures into an agent-ready prompt. - */ function formatForAgent(results: QuickCheckResult[]): string { - const failedResults = results.filter((r) => !r.passed); - if (failedResults.length === 0) return ''; - - const parts: string[] = []; - - for (const result of failedResults) { - if (result.agentPrompt) { - parts.push(result.agentPrompt); - } - } - - return parts.join('\n\n'); + return results + .filter((r) => !r.passed && r.agentPrompt) + .map((r) => r.agentPrompt!) + .join('\n\n'); } /** - * Spawn a command and collect output. + * Validation callback suitable for RetryConfig.validateAndFormat. + * Returns null if checks pass, or an agent-ready error prompt if they fail. */ +export async function quickCheckValidateAndFormat(workingDirectory: string): Promise { + const result = await runQuickChecks(workingDirectory); + return result.passed ? 
null : result.agentRetryPrompt; +} + function spawnCommand( command: string, args: string[], diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts index 2316a18..62f057d 100644 --- a/tests/evals/__tests__/agent-executor.spec.ts +++ b/tests/evals/__tests__/agent-executor.spec.ts @@ -52,7 +52,7 @@ vi.mock('../../../src/lib/settings.js', () => ({ })); vi.mock('../../../src/lib/validation/quick-checks.js', () => ({ - runQuickChecks: vi.fn(), + quickCheckValidateAndFormat: vi.fn(), })); // Mock debug/analytics that agent-interface transitively imports diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts index 73c7e98..0b244d2 100644 --- a/tests/evals/agent-executor.ts +++ b/tests/evals/agent-executor.ts @@ -5,7 +5,7 @@ import { writeEnvLocal } from '../../src/lib/env-writer.js'; import { parseEnvFile } from '../../src/utils/env-parser.js'; import { getConfig } from '../../src/lib/settings.js'; import { LatencyTracker } from './latency-tracker.js'; -import { runQuickChecks } from '../../src/lib/validation/quick-checks.js'; +import { quickCheckValidateAndFormat } from '../../src/lib/validation/quick-checks.js'; import { runAgent, type AgentRunConfig, type RetryConfig } from '../../src/lib/agent-interface.js'; import type { InstallerOptions } from '../../src/utils/types.js'; import type { ToolCall, LatencyMetrics } from './types.js'; @@ -91,19 +91,16 @@ export class AgentExecutor { async run(retryConfig?: AgentRetryConfig): Promise { const config = retryConfig ?? { enabled: true, maxRetries: 2 }; - const integration = this.getIntegration(); const toolCalls: ToolCall[] = []; const collectedOutput: string[] = []; const label = this.options.scenarioName ? 
`[${this.options.scenarioName}]` : ''; if (this.options.verbose) { - console.log(`${label} Initializing agent for ${integration}...`); + console.log(`${label} Initializing agent for ${this.framework}...`); } - // Start latency tracking this.latencyTracker.start(); - // Write credentials to appropriate env file based on framework const envVars = { WORKOS_API_KEY: this.credentials.workosApiKey, WORKOS_CLIENT_ID: this.credentials.workosClientId, @@ -115,21 +112,18 @@ export class AgentExecutor { writeEnvFile(this.workDir, envVars); } - // Build prompt - const skillName = SKILL_NAMES[integration]; + const skillName = SKILL_NAMES[this.framework]; const prompt = this.buildPrompt(skillName); - // Build SDK environment for direct mode const sdkEnv: Record = { ...process.env, ANTHROPIC_API_KEY: this.credentials.anthropicApiKey, + ANTHROPIC_BASE_URL: undefined, + ANTHROPIC_AUTH_TOKEN: undefined, CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true', CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true', }; - delete sdkEnv.ANTHROPIC_BASE_URL; - delete sdkEnv.ANTHROPIC_AUTH_TOKEN; - // Construct AgentRunConfig directly (bypasses initializeAgent/gateway auth) const agentRunConfig: AgentRunConfig = { workingDirectory: this.workDir, mcpServers: { @@ -143,7 +137,6 @@ export class AgentExecutor { sdkEnv, }; - // Thin InstallerOptions — only what runAgent needs const installerOptions: InstallerOptions = { debug: this.options.verbose ?? false, forceInstall: false, @@ -153,15 +146,8 @@ export class AgentExecutor { skipAuth: true, }; - // Build production RetryConfig with validateAndFormat callback const prodRetryConfig: RetryConfig | undefined = config.enabled - ? { - maxRetries: config.maxRetries, - validateAndFormat: async (workingDirectory: string): Promise => { - const quickResult = await runQuickChecks(workingDirectory); - return quickResult.passed ? null : quickResult.agentRetryPrompt; - }, - } + ? 
{ maxRetries: config.maxRetries, validateAndFormat: quickCheckValidateAndFormat } : undefined; try { @@ -178,34 +164,19 @@ export class AgentExecutor { const latencyMetrics = this.latencyTracker.finish(); const correctionAttempts = result.retryCount ?? 0; + const base = { output: collectedOutput.join('\n'), toolCalls, latencyMetrics, correctionAttempts }; if (result.error) { - return { - success: false, - output: collectedOutput.join('\n'), - toolCalls, - latencyMetrics, - error: result.errorMessage ?? String(result.error), - correctionAttempts, - selfCorrected: false, - }; + return { ...base, success: false, error: result.errorMessage ?? String(result.error), selfCorrected: false }; } - return { - success: true, - output: collectedOutput.join('\n'), - toolCalls, - latencyMetrics, - correctionAttempts, - selfCorrected: correctionAttempts > 0, - }; + return { ...base, success: true, selfCorrected: correctionAttempts > 0 }; } catch (error) { - const latencyMetrics = this.latencyTracker.finish(); return { success: false, output: collectedOutput.join('\n'), toolCalls, - latencyMetrics, + latencyMetrics: this.latencyTracker.finish(), error: error instanceof Error ? 
error.message : String(error), correctionAttempts: 0, selfCorrected: false, @@ -273,7 +244,4 @@ Begin by invoking the ${skillName} skill.`; } } - private getIntegration(): string { - return this.framework; - } } diff --git a/tests/evals/parallel-runner.ts b/tests/evals/parallel-runner.ts index 2383db7..0af0074 100644 --- a/tests/evals/parallel-runner.ts +++ b/tests/evals/parallel-runner.ts @@ -186,11 +186,7 @@ export class ParallelRunner { if (lastResult && !lastResult.passed) { console.log(`✗ ${scenarioName} FAILED`); - if (!this.options.verbose) { - this.printFailureDetails(lastResult, false); - } else { - this.printFailureDetails(lastResult, true); - } + this.printFailureDetails(lastResult, !!this.options.verbose); evalEvents.emitScenarioFail({ scenario: scenarioName, framework: scenario.framework, From 57047fc458311497749af3568622d0066f35ca08 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Sat, 14 Feb 2026 15:34:47 -0600 Subject: [PATCH 14/14] chore: formatting --- src/lib/agent-interface.ts | 6 +++++- tests/evals/agent-executor.ts | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index dbe0c99..50be6ff 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -625,7 +625,11 @@ export async function runAgent( if (message.type === 'result') { resolveCurrentTurn(); } - try { onMessage?.(message); } catch { /* non-critical */ } + try { + onMessage?.(message); + } catch { + /* non-critical */ + } } const durationMs = Date.now() - startTime; diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts index 0b244d2..46c2a72 100644 --- a/tests/evals/agent-executor.ts +++ b/tests/evals/agent-executor.ts @@ -243,5 +243,4 @@ Begin by invoking the ${skillName} skill.`; } } } - }