diff --git a/src/lib/agent-interface.spec.ts b/src/lib/agent-interface.spec.ts new file mode 100644 index 0000000..b266aec --- /dev/null +++ b/src/lib/agent-interface.spec.ts @@ -0,0 +1,268 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { EventEmitter } from 'node:events'; + +const { mockQuery, mockConfig } = vi.hoisted(() => ({ + mockQuery: vi.fn(), + mockConfig: { + model: 'test-model', + workos: { clientId: 'client_test', authkitDomain: 'test.workos.com', llmGatewayUrl: 'http://localhost:8000' }, + telemetry: { enabled: false, eventName: 'test_event' }, + proxy: { refreshThresholdMs: 300000 }, + nodeVersion: '20', + logging: { debugMode: false }, + documentation: { + workosDocsUrl: 'https://workos.com/docs', + dashboardUrl: 'https://dashboard.workos.com', + issuesUrl: 'https://github.com', + }, + frameworks: {}, + legacy: { oauthPort: 3000 }, + branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false }, + }, +})); + +vi.mock('@anthropic-ai/claude-agent-sdk', () => ({ + query: (...args: unknown[]) => mockQuery(...args), +})); + +vi.mock('../utils/debug.js', () => ({ + debug: vi.fn(), + logInfo: vi.fn(), + logWarn: vi.fn(), + logError: vi.fn(), + initLogFile: vi.fn(), + getLogFilePath: vi.fn(() => null), +})); + +vi.mock('../utils/analytics.js', () => ({ + analytics: { + capture: vi.fn(), + setTag: vi.fn(), + shutdown: vi.fn(), + llmRequest: vi.fn(), + incrementAgentIterations: vi.fn(), + toolCalled: vi.fn(), + }, +})); + +vi.mock('./settings.js', () => ({ + getConfig: vi.fn(() => mockConfig), + getAuthkitDomain: vi.fn(() => 'test.workos.com'), + getCliAuthClientId: vi.fn(() => 'client_test'), +})); + +vi.mock('./credentials.js', () => ({ + hasCredentials: vi.fn(() => false), + getCredentials: vi.fn(() => null), +})); + +vi.mock('./token-refresh.js', () => ({ + ensureValidToken: vi.fn(async () => ({ success: true })), +})); + +vi.mock('./credential-proxy.js', () => ({ + startCredentialProxy: vi.fn(), +})); + 
+vi.mock('../utils/urls.js', () => ({
+  getLlmGatewayUrlFromHost: vi.fn(() => 'http://localhost:8000'),
+}));
+
+import { runAgent, type RetryConfig } from './agent-interface.js';
+import { InstallerEventEmitter } from './events.js';
+import type { InstallerOptions } from '../utils/types.js';
+
+/**
+ * Create a mock SDK response that consumes the prompt stream and yields
+ * responses for each prompt message. This models the real SDK behavior:
+ * the response generator stays alive as long as prompts keep coming.
+ */
+function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>) {
+  return function mockQueryImpl({ prompt }: { prompt: AsyncIterable<unknown>; options: unknown }) {
+    let turnIndex = 0;
+
+    async function* responseGenerator() {
+      // Consume each prompt message and respond with the corresponding turn
+      for await (const _promptMsg of prompt) {
+        if (turnIndex >= turns.length) continue;
+
+        const turn = turns[turnIndex];
+        turnIndex++;
+
+        if (turn.text) {
+          yield {
+            type: 'assistant',
+            message: {
+              content: [{ type: 'text', text: turn.text }],
+              usage: { input_tokens: 100, output_tokens: 50 },
+              model: 'test-model',
+            },
+          };
+        }
+
+        yield {
+          type: 'result',
+          subtype: turn.error ? 'error' : 'success',
+          result: turn.text ?? '',
+          ...(turn.error ?
{ errors: ['Test error'] } : {}), + }; + } + } + + return responseGenerator(); + }; +} + +function makeAgentConfig() { + return { + workingDirectory: '/tmp/test', + mcpServers: {}, + model: 'test-model', + allowedTools: [], + sdkEnv: {}, + }; +} + +function makeOptions(overrides: Partial = {}): InstallerOptions { + return { + debug: false, + forceInstall: false, + installDir: '/tmp/test', + local: true, + ci: false, + skipAuth: true, + ...overrides, + }; +} + +describe('runAgent retry loop', () => { + let emitter: InstallerEventEmitter; + let emittedEvents: Array<{ event: string; payload: unknown }>; + + beforeEach(() => { + mockQuery.mockReset(); + emitter = new InstallerEventEmitter(); + emittedEvents = []; + + // Capture all events + const originalEmit = emitter.emit.bind(emitter); + emitter.emit = ((event: string, payload: unknown) => { + emittedEvents.push({ event, payload }); + return originalEmit(event, payload); + }) as typeof emitter.emit; + }); + + it('returns retryCount=0 when no retryConfig provided', async () => { + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }])); + + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + }); + + it('returns retryCount=0 when validation passes first try', async () => { + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' 
}])); + + const validateAndFormat = vi.fn().mockResolvedValue(null); // passes + + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + expect(validateAndFormat).toHaveBeenCalledTimes(1); + + // Should emit validation:retry:start and validation:retry:complete + const retryStartEvents = emittedEvents.filter((e) => e.event === 'validation:retry:start'); + const retryCompleteEvents = emittedEvents.filter((e) => e.event === 'validation:retry:complete'); + expect(retryStartEvents).toHaveLength(1); + expect(retryCompleteEvents).toHaveLength(1); + expect(retryCompleteEvents[0].payload).toEqual({ attempt: 1, passed: true }); + + // Should NOT emit agent:retry (no retry happened) + const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry'); + expect(retryEvents).toHaveLength(0); + }); + + it('retries once when validation fails then passes', async () => { + // Two turns: initial + one retry + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Initial attempt' }, { text: 'Fixed it!' 
}])); + + const validateAndFormat = vi + .fn() + .mockResolvedValueOnce('Type error in src/foo.ts') // fail first + .mockResolvedValueOnce(null); // pass second + + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(1); + expect(validateAndFormat).toHaveBeenCalledTimes(2); + + // Should emit agent:retry once + const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry'); + expect(retryEvents).toHaveLength(1); + expect(retryEvents[0].payload).toEqual({ attempt: 1, maxRetries: 2 }); + }); + + it('caps at maxRetries when validation always fails', async () => { + // Three turns: initial + 2 retries + mockQuery.mockImplementation( + createMockSDKResponse([{ text: 'Attempt 1' }, { text: 'Attempt 2' }, { text: 'Attempt 3' }]), + ); + + const validateAndFormat = vi.fn().mockResolvedValue('Still broken'); + + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(2); + // Called 2 times: after initial + after retry 1 + // NOT called after retry 2 because the loop exits + expect(validateAndFormat).toHaveBeenCalledTimes(2); + + const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry'); + expect(retryEvents).toHaveLength(2); + }); + + it('preserves existing behavior with maxRetries=0', async () => { + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' 
}])); + + const validateAndFormat = vi.fn().mockResolvedValue('Error'); + + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 0, + validateAndFormat, + }); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + // validateAndFormat should never be called with maxRetries=0 + expect(validateAndFormat).not.toHaveBeenCalled(); + }); + + it('treats validateAndFormat errors as passed', async () => { + mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Done!' }])); + + const validateAndFormat = vi.fn().mockRejectedValue(new Error('Validation crashed')); + + const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, { + maxRetries: 2, + validateAndFormat, + }); + + expect(result.error).toBeUndefined(); + expect(result.retryCount).toBe(0); + // Should have been called once, threw, treated as passed + expect(validateAndFormat).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 9022b3a..50be6ff 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -72,10 +72,18 @@ export type AgentConfig = { workOSApiHost: string; }; +export interface RetryConfig { + /** Max correction attempts after initial run. Default: 2 */ + maxRetries: number; + /** Run between agent turns. Return null if passed, or error prompt if failed. */ + validateAndFormat: (workingDirectory: string) => Promise; +} + /** - * Internal configuration object returned by initializeAgent + * Configuration object for running the agent. + * Built by initializeAgent (production) or constructed directly (evals). 
*/ -type AgentRunConfig = { +export type AgentRunConfig = { workingDirectory: string; mcpServers: McpServersConfig; model: string; @@ -489,7 +497,9 @@ export async function runAgent( errorMessage?: string; }, emitter?: InstallerEventEmitter, -): Promise<{ error?: AgentErrorType; errorMessage?: string }> { + retryConfig?: RetryConfig, + onMessage?: (message: SDKMessage) => void, +): Promise<{ error?: AgentErrorType; errorMessage?: string; retryCount?: number }> { const { spinnerMessage = 'Setting up WorkOS AuthKit...', successMessage = 'WorkOS AuthKit integration complete', @@ -509,15 +519,20 @@ export async function runAgent( const collectedText: string[] = []; try { - // Workaround for SDK bug: stdin closes before canUseTool responses can be sent. - // The fix is to use an async generator for the prompt that stays open until - // the result is received, keeping the stdin stream alive for permission responses. - // See: https://github.com/anthropics/claude-code/issues/4775 - // See: https://github.com/anthropics/claude-agent-sdk-typescript/issues/41 - let signalDone: () => void; - const resultReceived = new Promise((resolve) => { - signalDone = resolve; - }); + let retryCount = 0; + const maxRetries = retryConfig?.maxRetries ?? 0; + + // Turn completion signals — resolveCurrentTurn is called when a 'result' + // message arrives; the prompt generator awaits currentTurnDone between turns. 
+    let resolveCurrentTurn!: () => void;
+    let currentTurnDone!: Promise<void>;
+
+    function resetTurnSignal() {
+      currentTurnDone = new Promise<void>((resolve) => {
+        resolveCurrentTurn = resolve;
+      });
+    }
+    resetTurnSignal();
 
     const createPromptStream = async function* () {
       yield {
@@ -526,7 +541,44 @@
         type: 'user',
         session_id: '',
         message: { role: 'user', content: prompt },
         parent_tool_use_id: null,
       };
-      await resultReceived;
+
+      if (retryConfig && maxRetries > 0) {
+        while (retryCount < maxRetries) {
+          await currentTurnDone;
+
+          emitter?.emit('validation:retry:start', { attempt: retryCount + 1 });
+
+          let validationPrompt: string | null;
+          try {
+            validationPrompt = await retryConfig.validateAndFormat(agentConfig.workingDirectory);
+          } catch (err) {
+            // Don't block on validation bugs — treat as passed
+            logError('validateAndFormat threw:', err);
+            validationPrompt = null;
+          }
+
+          emitter?.emit('validation:retry:complete', {
+            attempt: retryCount + 1,
+            passed: validationPrompt === null,
+          });
+
+          if (validationPrompt === null) break;
+
+          retryCount++;
+          emitter?.emit('agent:retry', { attempt: retryCount, maxRetries });
+
+          resetTurnSignal();
+
+          yield {
+            type: 'user',
+            session_id: '',
+            message: { role: 'user', content: validationPrompt },
+            parent_tool_use_id: null,
+          };
+        }
+      }
+
+      await currentTurnDone;
     };
 
     // Load plugin with bundled skills
@@ -570,9 +622,13 @@
       if (messageError) {
         sdkError = messageError;
       }
-      // Signal completion when result received
       if (message.type === 'result') {
-        signalDone!();
+        resolveCurrentTurn();
+      }
+      try {
+        onMessage?.(message);
+      } catch {
+        /* non-critical */
       }
     }
 
@@ -597,15 +653,18 @@
       return { error: AgentErrorType.RESOURCE_MISSING, errorMessage: 'Could not access setup resource' };
     }
 
-    logInfo(`Agent run completed in ${Math.round(durationMs / 1000)}s`);
+    logInfo(`Agent run completed in ${Math.round(durationMs / 1000)}s (${retryCount} retries)`);
analytics.capture(INSTALLER_INTERACTION_EVENT_NAME, { action: 'agent integration completed', duration_ms: durationMs, duration_seconds: Math.round(durationMs / 1000), + retry_count: retryCount, + max_retries: maxRetries, + passed_after_retry: retryCount > 0, }); // Don't emit agent:success here - let the state machine handle lifecycle events - return {}; + return { retryCount }; } catch (error) { // Don't emit events here - just log and re-throw for state machine to handle logError('Agent run failed:', error); diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index b6fcabf..b554d73 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -1,5 +1,5 @@ import { SPINNER_MESSAGE, type FrameworkConfig } from './framework-config.js'; -import { validateInstallation } from './validation/index.js'; +import { validateInstallation, quickCheckValidateAndFormat } from './validation/index.js'; import type { InstallerOptions } from '../utils/types.js'; import { ensurePackageIsInstalled, @@ -9,7 +9,7 @@ import { } from '../utils/clack-utils.js'; import { analytics } from '../utils/analytics.js'; import { INSTALLER_INTERACTION_EVENT_NAME } from './constants.js'; -import { initializeAgent, runAgent } from './agent-interface.js'; +import { initializeAgent, runAgent, type RetryConfig } from './agent-interface.js'; import { uploadEnvironmentVariablesStep } from '../steps/index.js'; import { autoConfigureWorkOSEnvironment } from './workos-management.js'; import { detectPort, getCallbackPath } from './port-detection.js'; @@ -113,7 +113,14 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal options, ); - // Run agent - errors will throw naturally with skill-based approach + const retryConfig: RetryConfig | undefined = options.noValidate + ? undefined + : { + maxRetries: options.maxRetries ?? 
2, + validateAndFormat: quickCheckValidateAndFormat, + }; + + // Run agent with retry support — agent gets correction prompts on validation failure const agentResult = await runAgent( agent, integrationPrompt, @@ -124,6 +131,7 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal errorMessage: 'Integration failed', }, options.emitter, + retryConfig, ); // If agent returned an error, throw so state machine can handle it @@ -133,12 +141,23 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal throw new Error(`Agent SDK error: ${message}`); } - // Run post-installation validation + // Track retry metrics + if (agentResult.retryCount !== undefined && agentResult.retryCount > 0) { + analytics.capture(INSTALLER_INTERACTION_EVENT_NAME, { + action: 'agent retry summary', + retry_count: agentResult.retryCount, + max_retries: options.maxRetries ?? 2, + passed_after_retry: true, + }); + } + + // Run full validation after agent (with retries) completes + // Quick checks already ran inside the retry loop — skip build if (!options.noValidate) { options.emitter?.emit('validation:start', { framework: config.metadata.integration }); const validationResult = await validateInstallation(config.metadata.integration, options.installDir, { - runBuild: true, + runBuild: false, }); if (validationResult.issues.length > 0) { @@ -164,12 +183,6 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal }); } - // Skip MCP server setup for now (WorkOS doesn't need it initially) - // await addMCPServerToClientsStep({ ... }); - - // Build outro message - const continueUrl = undefined; // No signup flow for WorkOS wizard - const changes = [ ...config.ui.getOutroChanges(frameworkContext), Object.keys(envVars).length > 0 ? 
`Added environment variables to .env file` : '', @@ -183,8 +196,7 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal : '', ].filter(Boolean); - // Build detailed summary to return to caller (state machine) - const summary = buildCompletionSummary(config, changes, nextSteps, continueUrl); + const summary = buildCompletionSummary(config, changes, nextSteps); await analytics.shutdown('success'); @@ -251,41 +263,24 @@ Report your progress using [STATUS] prefixes. Begin by invoking the ${skillName} skill.`; } -/** - * Build a completion summary for the event payload. - * This is a plain-text summary without styling (adapters handle presentation). - */ -function buildCompletionSummary( - config: FrameworkConfig, - changes: string[], - nextSteps: string[], - continueUrl: string | undefined, -): string { - const lines: string[] = []; - - lines.push('Successfully installed WorkOS AuthKit!'); - lines.push(''); +function buildCompletionSummary(config: FrameworkConfig, changes: string[], nextSteps: string[]): string { + const lines: string[] = ['Successfully installed WorkOS AuthKit!', '']; if (changes.length > 0) { lines.push('What the agent did:'); - changes.forEach((change) => lines.push(`• ${change}`)); + for (const change of changes) lines.push(`• ${change}`); lines.push(''); } if (nextSteps.length > 0) { lines.push('Next steps:'); - nextSteps.forEach((step) => lines.push(`• ${step}`)); + for (const step of nextSteps) lines.push(`• ${step}`); lines.push(''); } - lines.push(`Learn more: ${config.metadata.docsUrl}`); - - if (continueUrl) { - lines.push(`Continue onboarding: ${continueUrl}`); - } - - lines.push(''); lines.push( + `Learn more: ${config.metadata.docsUrl}`, + '', 'Note: This installer uses an LLM agent to analyze and modify your project. 
Please review the changes made.', ); diff --git a/src/lib/events.ts b/src/lib/events.ts index cec5cc9..027bd31 100644 --- a/src/lib/events.ts +++ b/src/lib/events.ts @@ -52,6 +52,10 @@ export interface InstallerEvents { 'agent:progress': { step: string; detail?: string }; 'agent:success': { summary?: string }; 'agent:failure': { message: string; stack?: string }; + 'agent:retry': { attempt: number; maxRetries: number }; + + 'validation:retry:start': { attempt: number }; + 'validation:retry:complete': { attempt: number; passed: boolean }; 'validation:start': { framework: string }; 'validation:issues': { issues: import('./validation/types.js').ValidationIssue[] }; diff --git a/src/lib/validation/build-validator.spec.ts b/src/lib/validation/build-validator.spec.ts new file mode 100644 index 0000000..41273b7 --- /dev/null +++ b/src/lib/validation/build-validator.spec.ts @@ -0,0 +1,116 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { detectBuildCommand } from './build-validator.js'; + +describe('detectBuildCommand', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'build-detect-test-')); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('detects package.json with build script (pnpm)', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } })); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'pnpm', args: ['build'] }); + }); + + it('detects package.json with build script (npm)', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'react-scripts build' } })); + + const cmd = await detectBuildCommand(testDir); + + 
expect(cmd).toEqual({ command: 'npm', args: ['run', 'build'] }); + }); + + it('skips package.json without build script', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { start: 'node index.js' } })); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('detects go.mod → go build', async () => { + writeFileSync(join(testDir, 'go.mod'), 'module example.com/app\n\ngo 1.21\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'go', args: ['build', './...'] }); + }); + + it('detects mix.exs → mix compile', async () => { + writeFileSync(join(testDir, 'mix.exs'), 'defmodule MyApp.MixProject do\nend\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'mix', args: ['compile'] }); + }); + + it('detects *.csproj → dotnet build', async () => { + writeFileSync(join(testDir, 'MyApp.csproj'), '\n\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'dotnet', args: ['build'] }); + }); + + it('detects build.gradle.kts with gradlew → ./gradlew build', async () => { + writeFileSync(join(testDir, 'build.gradle.kts'), 'plugins { kotlin("jvm") }\n'); + writeFileSync(join(testDir, 'gradlew'), '#!/bin/sh\nexec gradle "$@"\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: './gradlew', args: ['build'] }); + }); + + it('detects build.gradle without gradlew → gradle build', async () => { + writeFileSync(join(testDir, 'build.gradle'), 'apply plugin: "java"\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'gradle', args: ['build'] }); + }); + + it('returns null for empty directory', async () => { + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('returns null for Python project (no universal build)', async () => { + writeFileSync(join(testDir, 'pyproject.toml'), '[project]\nname = 
"myapp"\n'); + writeFileSync(join(testDir, 'app.py'), 'print("hello")\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('returns null for Ruby project (no universal build)', async () => { + writeFileSync(join(testDir, 'Gemfile'), 'source "https://rubygems.org"\ngem "rails"\n'); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toBeNull(); + }); + + it('package.json build script takes priority over go.mod', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'tsc' } })); + writeFileSync(join(testDir, 'go.mod'), 'module example.com/app\n'); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + + const cmd = await detectBuildCommand(testDir); + + expect(cmd).toEqual({ command: 'pnpm', args: ['build'] }); + }); +}); diff --git a/src/lib/validation/build-validator.ts b/src/lib/validation/build-validator.ts index 8debd15..854f96d 100644 --- a/src/lib/validation/build-validator.ts +++ b/src/lib/validation/build-validator.ts @@ -1,5 +1,5 @@ import { spawn } from 'child_process'; -import { existsSync } from 'fs'; +import { existsSync, readdirSync } from 'fs'; import { readFile } from 'fs/promises'; import { join } from 'path'; import type { ValidationIssue } from './types.js'; @@ -99,13 +99,13 @@ export async function runBuildValidation(projectDir: string, timeoutMs: number = }); } -function detectPackageManager(projectDir: string): 'pnpm' | 'yarn' | 'npm' { +export function detectPackageManager(projectDir: string): 'pnpm' | 'yarn' | 'npm' { if (existsSync(join(projectDir, 'pnpm-lock.yaml'))) return 'pnpm'; if (existsSync(join(projectDir, 'yarn.lock'))) return 'yarn'; return 'npm'; } -async function hasBuildScriptInPackageJson(projectDir: string): Promise { +export async function hasBuildScriptInPackageJson(projectDir: string): Promise { try { const content = await readFile(join(projectDir, 'package.json'), 'utf-8'); const pkg = JSON.parse(content) as { scripts?: { 
        build?: string } };
@@ -115,7 +115,48 @@
   }
 }
 
-function parseBuildErrors(output: string): string[] {
+export interface BuildCommand {
+  command: string;
+  args: string[];
+}
+
+/**
+ * Detect the build command for a project by checking ecosystem markers.
+ * Returns null if no build system detected — caller should skip build validation.
+ */
+export async function detectBuildCommand(projectDir: string): Promise<BuildCommand | null> {
+  const pm = detectPackageManager(projectDir);
+  if (await hasBuildScriptInPackageJson(projectDir)) {
+    const args = pm === 'npm' ? ['run', 'build'] : ['build'];
+    return { command: pm, args };
+  }
+
+  if (existsSync(join(projectDir, 'go.mod'))) {
+    return { command: 'go', args: ['build', './...'] };
+  }
+
+  if (existsSync(join(projectDir, 'mix.exs'))) {
+    return { command: 'mix', args: ['compile'] };
+  }
+
+  try {
+    const files = readdirSync(projectDir);
+    if (files.some((f) => f.endsWith('.csproj'))) {
+      return { command: 'dotnet', args: ['build'] };
+    }
+  } catch {
+    // Can't read directory
+  }
+
+  if (existsSync(join(projectDir, 'build.gradle.kts')) || existsSync(join(projectDir, 'build.gradle'))) {
+    const gradlew = existsSync(join(projectDir, 'gradlew')) ? './gradlew' : 'gradle';
+    return { command: gradlew, args: ['build'] };
+  }
+
+  return null;
+}
+
+export function parseBuildErrors(output: string): string[] {
   const errors: string[] = [];
 
   // TypeScript errors: "file.ts(line,col): error TS..."
diff --git a/src/lib/validation/index.ts b/src/lib/validation/index.ts index 49e74d2..26c6316 100644 --- a/src/lib/validation/index.ts +++ b/src/lib/validation/index.ts @@ -1,5 +1,13 @@ -export { validateInstallation, type ValidateOptions } from './validator.js'; +export { + validateInstallation, + validatePackages, + validateEnvVars, + validateFiles, + validateFrameworkSpecific, + type ValidateOptions, +} from './validator.js'; export { runBuildValidation, type BuildResult } from './build-validator.js'; +export { runQuickChecks, runTypecheckValidation, quickCheckValidateAndFormat } from './quick-checks.js'; export type { ValidationResult, ValidationRules, @@ -10,4 +18,6 @@ export type { EnvVarRule, FileRule, VariantRules, + QuickCheckResult, + QuickChecksOutput, } from './types.js'; diff --git a/src/lib/validation/quick-checks.spec.ts b/src/lib/validation/quick-checks.spec.ts new file mode 100644 index 0000000..a36dd1e --- /dev/null +++ b/src/lib/validation/quick-checks.spec.ts @@ -0,0 +1,276 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { EventEmitter } from 'node:events'; + +// Mock child_process.spawn to avoid actually running tsc/build +vi.mock('child_process', () => ({ + spawn: vi.fn(), +})); + +import { spawn } from 'child_process'; +import { runQuickChecks, runTypecheckValidation } from './quick-checks.js'; + +const mockSpawn = vi.mocked(spawn); + +/** + * Creates a mock process lazily — must be used inside mockImplementationOnce, + * NOT mockReturnValueOnce, so the setTimeout fires after event listeners attach. 
+ */ +function createMockProcess(exitCode: number, stdout = '', stderr = '') { + const proc = new EventEmitter() as any; + proc.stdout = new EventEmitter(); + proc.stderr = new EventEmitter(); + + setTimeout(() => { + if (stdout) proc.stdout.emit('data', Buffer.from(stdout)); + if (stderr) proc.stderr.emit('data', Buffer.from(stderr)); + proc.emit('close', exitCode); + }, 10); + + return proc; +} + +describe('runQuickChecks', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'quick-checks-test-')); + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ + scripts: { typecheck: 'tsc --noEmit', build: 'next build' }, + }), + ); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + mockSpawn.mockReset(); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('returns passed=true when both typecheck and build succeed', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(2); + expect(result.results[0].phase).toBe('typecheck'); + expect(result.results[1].phase).toBe('build'); + expect(result.agentRetryPrompt).toBeNull(); + }); + + it('short-circuits build when typecheck fails', async () => { + const tsError = "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable"; + + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(false); + expect(result.results).toHaveLength(1); + expect(result.results[0].phase).toBe('typecheck'); + expect(result.results[0].passed).toBe(false); + expect(mockSpawn).toHaveBeenCalledTimes(1); + }); + + it('runs build after typecheck passes', async () => { + mockSpawn.mockImplementationOnce(() => 
createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(2); + expect(mockSpawn).toHaveBeenCalledTimes(2); + }); + + it('skips build when skipBuild option is true', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir, { skipBuild: true }); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(1); + expect(result.results[0].phase).toBe('typecheck'); + expect(mockSpawn).toHaveBeenCalledTimes(1); + }); + + it('generates agentRetryPrompt when typecheck fails', async () => { + const tsError = + "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runQuickChecks(testDir); + + expect(result.agentRetryPrompt).not.toBeNull(); + expect(result.agentRetryPrompt).toContain('typecheck failed'); + expect(result.agentRetryPrompt).toContain('src/middleware.ts'); + }); + + it('tracks total duration', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)).mockImplementationOnce(() => createMockProcess(0)); + + const result = await runQuickChecks(testDir); + + expect(typeof result.totalDurationMs).toBe('number'); + expect(result.totalDurationMs).toBeGreaterThanOrEqual(0); + }); + + it('reports build failure when typecheck passes but build fails', async () => { + mockSpawn + .mockImplementationOnce(() => createMockProcess(0)) // typecheck pass + .mockImplementationOnce(() => createMockProcess(1, '', 'Error: Build failed')); // build fail + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(false); + expect(result.results).toHaveLength(2); + expect(result.results[0].passed).toBe(true); + expect(result.results[1].passed).toBe(false); + 
expect(result.results[1].phase).toBe('build'); + expect(result.agentRetryPrompt).toContain('build failed'); + }); + + it('skips build when no build system detected (e.g., Python project)', async () => { + // Rewrite testDir without a build script or any build system markers + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { typecheck: 'tsc --noEmit' } })); + + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); // typecheck pass only + + const result = await runQuickChecks(testDir); + + expect(result.passed).toBe(true); + expect(result.results).toHaveLength(2); + expect(result.results[0].phase).toBe('typecheck'); + expect(result.results[1].phase).toBe('build'); + expect(result.results[1].passed).toBe(true); // passed through silently + // Only one spawn call (typecheck) — no spawn for build + expect(mockSpawn).toHaveBeenCalledTimes(1); + }); +}); + +describe('runTypecheckValidation', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'typecheck-test-')); + writeFileSync( + join(testDir, 'package.json'), + JSON.stringify({ + scripts: { typecheck: 'tsc --noEmit' }, + }), + ); + writeFileSync(join(testDir, 'pnpm-lock.yaml'), ''); + mockSpawn.mockReset(); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('returns passed=true when typecheck succeeds', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(true); + expect(result.phase).toBe('typecheck'); + expect(result.issues).toHaveLength(0); + expect(result.agentPrompt).toBeNull(); + }); + + it('parses TypeScript errors from output', async () => { + const tsError = + "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await 
runTypecheckValidation(testDir); + + expect(result.passed).toBe(false); + expect(result.issues.length).toBeGreaterThan(0); + expect(result.issues[0].message).toContain('Type error'); + expect(result.issues[0].severity).toBe('error'); + }); + + it('formats errors into actionable agent prompt', async () => { + const tsError = + "src/middleware.ts(42,5): error TS2345: Argument of type 'string | undefined' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', tsError)); + + const result = await runTypecheckValidation(testDir); + + expect(result.agentPrompt).not.toBeNull(); + expect(result.agentPrompt).toContain('src/middleware.ts'); + expect(result.agentPrompt).toContain('not assignable'); + }); + + it('handles pretty-printed tsc errors (colon-separated format)', async () => { + const tsError = "src/app.tsx:10:3 - error TS2322: Type 'number' is not assignable to type 'string'."; + mockSpawn.mockImplementationOnce(() => createMockProcess(1, tsError, '')); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(false); + expect(result.issues.length).toBeGreaterThan(0); + }); + + it('provides fallback message when errors cannot be parsed', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(1, '', 'Some unknown error format that we cannot parse')); + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(false); + expect(result.issues).toHaveLength(1); + expect(result.issues[0].message).toBe('Typecheck failed'); + }); + + it('uses typecheck script from package.json when available', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + await runTypecheckValidation(testDir); + + expect(mockSpawn).toHaveBeenCalledWith('pnpm', ['typecheck'], expect.objectContaining({ cwd: testDir })); + }); + + it('falls back to npx tsc --noEmit when no typecheck script but tsconfig exists', async () => { + 
writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'next build' } })); + writeFileSync(join(testDir, 'tsconfig.json'), '{}'); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + await runTypecheckValidation(testDir); + + expect(mockSpawn).toHaveBeenCalledWith('npx', ['tsc', '--noEmit'], expect.objectContaining({ cwd: testDir })); + }); + + it('skips typecheck when no tsconfig.json and no typecheck script', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { build: 'go build' } })); + // No tsconfig.json — not a TypeScript project + + const result = await runTypecheckValidation(testDir); + + expect(result.passed).toBe(true); + expect(result.issues).toHaveLength(0); + expect(mockSpawn).not.toHaveBeenCalled(); + }); + + it('detects type-check script (hyphenated variant)', async () => { + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ scripts: { 'type-check': 'tsc --noEmit' } })); + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + await runTypecheckValidation(testDir); + + expect(mockSpawn).toHaveBeenCalledWith('pnpm', ['type-check'], expect.objectContaining({ cwd: testDir })); + }); + + it('tracks duration', async () => { + mockSpawn.mockImplementationOnce(() => createMockProcess(0)); + + const result = await runTypecheckValidation(testDir); + + expect(typeof result.durationMs).toBe('number'); + expect(result.durationMs).toBeGreaterThanOrEqual(0); + }); +}); diff --git a/src/lib/validation/quick-checks.ts b/src/lib/validation/quick-checks.ts new file mode 100644 index 0000000..2a3ec92 --- /dev/null +++ b/src/lib/validation/quick-checks.ts @@ -0,0 +1,248 @@ +import { spawn } from 'child_process'; +import { readFile } from 'fs/promises'; +import { join } from 'path'; +import type { QuickCheckResult, QuickChecksOutput, ValidationIssue } from './types.js'; +import { detectBuildCommand, detectPackageManager, parseBuildErrors } from './build-validator.js'; + 
+const DEFAULT_TYPECHECK_TIMEOUT_MS = 30_000;
+const DEFAULT_BUILD_TIMEOUT_MS = 60_000;
+
+/**
+ * Run fast deterministic checks: typecheck first, then build.
+ * Short-circuits: if typecheck fails, skip build (build will fail too).
+ *
+ * @param projectDir - Root of the project under validation.
+ * @param options    - skipBuild disables the build phase; timeoutMs overrides both phase timeouts.
+ * @returns Aggregate pass/fail, per-phase results, and an agent-ready retry prompt on failure.
+ */
+export async function runQuickChecks(
+  projectDir: string,
+  options?: { skipBuild?: boolean; timeoutMs?: number },
+): Promise<QuickChecksOutput> {
+  const startTime = Date.now();
+  const results: QuickCheckResult[] = [];
+
+  const typecheckResult = await runTypecheckValidation(projectDir, options?.timeoutMs ?? DEFAULT_TYPECHECK_TIMEOUT_MS);
+  results.push(typecheckResult);
+
+  // Build only runs when typecheck passed — a type error guarantees a build failure anyway.
+  if (typecheckResult.passed && !options?.skipBuild) {
+    results.push(await runBuildQuickCheck(projectDir, options?.timeoutMs ?? DEFAULT_BUILD_TIMEOUT_MS));
+  }
+
+  const passed = results.every((r) => r.passed);
+
+  return {
+    passed,
+    results,
+    agentRetryPrompt: passed ? null : formatForAgent(results),
+    totalDurationMs: Date.now() - startTime,
+  };
+}
+
+/** Shorthand for a passing phase result with its elapsed time. */
+function passResult(phase: QuickCheckResult['phase'], startTime: number): QuickCheckResult {
+  return { passed: true, phase, issues: [], agentPrompt: null, durationMs: Date.now() - startTime };
+}
+
+/**
+ * Run typecheck only (tsc --noEmit or framework equivalent).
+ * Faster than full build — catches type errors in ~5s.
+ * Passes silently when the project has no typecheck entry point (not a TS project).
+ */
+export async function runTypecheckValidation(
+  projectDir: string,
+  timeoutMs: number = DEFAULT_TYPECHECK_TIMEOUT_MS,
+): Promise<QuickCheckResult> {
+  const startTime = Date.now();
+  const typecheckCmd = await detectTypecheckCommand(projectDir);
+
+  if (!typecheckCmd) {
+    return passResult('typecheck', startTime);
+  }
+
+  const { exitCode, stdout, stderr } = await spawnCommand(
+    typecheckCmd.command,
+    typecheckCmd.args,
+    projectDir,
+    timeoutMs,
+  );
+
+  if (exitCode === 0) {
+    return passResult('typecheck', startTime);
+  }
+
+  const output = stdout + stderr;
+  const errors = parseTypecheckErrors(output);
+  const issues: ValidationIssue[] = errors.map((error) => ({
+    type: 'file',
+    severity: 'error',
+    message: `Type error: ${error}`,
+    hint: 'Fix the type error and run typecheck again',
+  }));
+
+  // Fallback issue when tsc output was in a format we could not parse.
+  if (issues.length === 0) {
+    issues.push({
+      type: 'file',
+      severity: 'error',
+      message: 'Typecheck failed',
+      hint: `Run \`${typecheckCmd.command} ${typecheckCmd.args.join(' ')}\` to see full output`,
+    });
+  }
+
+  return {
+    passed: false,
+    phase: 'typecheck',
+    issues,
+    agentPrompt: formatTypecheckErrors(errors, output),
+    durationMs: Date.now() - startTime,
+  };
+}
+
+/** Run the detected build command; passes silently when no build system is detected. */
+async function runBuildQuickCheck(projectDir: string, timeoutMs: number): Promise<QuickCheckResult> {
+  const startTime = Date.now();
+  const buildCmd = await detectBuildCommand(projectDir);
+
+  if (!buildCmd) {
+    return passResult('build', startTime);
+  }
+
+  const { exitCode, stdout, stderr } = await spawnCommand(buildCmd.command, buildCmd.args, projectDir, timeoutMs);
+
+  if (exitCode === 0) {
+    return passResult('build', startTime);
+  }
+
+  const output = stdout + stderr;
+  const errors = parseBuildErrors(output);
+  const issues: ValidationIssue[] =
+    errors.length > 0
+      ? 
errors.map((e) => ({ + type: 'file', + severity: 'error', + message: `Build error: ${e}`, + hint: 'Fix the error and run build again', + })) + : [ + { + type: 'file', + severity: 'error', + message: 'Build failed', + hint: `Run \`${buildCmd.command} ${buildCmd.args.join(' ')}\` to see full output`, + }, + ]; + + return { + passed: false, + phase: 'build', + issues, + agentPrompt: formatBuildErrors(issues), + durationMs: Date.now() - startTime, + }; +} + +interface TypecheckCommand { + command: string; + args: string[]; +} + +async function detectTypecheckCommand(projectDir: string): Promise { + const pm = detectPackageManager(projectDir); + + try { + const content = await readFile(join(projectDir, 'package.json'), 'utf-8'); + const pkg = JSON.parse(content) as { scripts?: Record }; + + const scriptName = pkg.scripts?.typecheck ? 'typecheck' : pkg.scripts?.['type-check'] ? 'type-check' : null; + if (scriptName) { + const args = pm === 'npm' ? ['run', scriptName] : [scriptName]; + return { command: pm, args }; + } + } catch { + // No package.json or malformed + } + + try { + await readFile(join(projectDir, 'tsconfig.json'), 'utf-8'); + return { command: 'npx', args: ['tsc', '--noEmit'] }; + } catch { + return null; + } +} + +function parseTypecheckErrors(output: string): string[] { + // Match both TS error formats: + // src/file.ts(line,col): error TS2345: ... + // src/file.ts:line:col - error TS2345: ... (tsc --pretty) + const pattern = /[\w./]+\.\w+(?:\(\d+,\d+\):\s*|:\d+:\d+\s*-\s*)error\s+TS\d+:.+/g; + const matches = output.match(pattern); + return matches ? [...new Set(matches)].slice(0, 10) : []; +} + +function formatTypecheckErrors(errors: string[], rawOutput: string): string { + if (errors.length === 0) { + // Couldn't parse specific errors — give raw output + const truncated = rawOutput.slice(0, 2000); + return `The typecheck failed. 
Here is the output:\n\n${truncated}\n\nFix the type errors shown above.`;
+  }
+
+  const lines = errors.map((error) => {
+    // Extract file:line info and error description.
+    // The path class includes '-' so kebab-case filenames (e.g. quick-checks.ts)
+    // are reported in full instead of being truncated at the hyphen.
+    const fileMatch = error.match(/([\w./-]+\.\w+)[:(]\d+/);
+    const tsMatch = error.match(/error\s+(TS\d+):\s*(.+)/);
+
+    if (fileMatch && tsMatch) {
+      return `- ${fileMatch[1]}: ${tsMatch[2]} (${tsMatch[1]})`;
+    }
+    return `- ${error}`;
+  });
+
+  return `The typecheck failed with ${errors.length} error${errors.length === 1 ? '' : 's'}:\n\n${lines.join('\n')}\n\nFix these type errors in the indicated files.`;
+}
+
+/** Collapse build issues into a single agent-ready prompt. */
+function formatBuildErrors(issues: ValidationIssue[]): string {
+  const errorMessages = issues.map((i) => `- ${i.message}`);
+  return `The build failed:\n\n${errorMessages.join('\n')}\n\nFix these build errors.`;
+}
+
+/** Join the per-phase agent prompts of every failed check. */
+function formatForAgent(results: QuickCheckResult[]): string {
+  return results
+    .filter((r) => !r.passed && r.agentPrompt)
+    .map((r) => r.agentPrompt!)
+    .join('\n\n');
+}
+
+/**
+ * Validation callback suitable for RetryConfig.validateAndFormat.
+ * Returns null if checks pass, or an agent-ready error prompt if they fail.
+ */
+export async function quickCheckValidateAndFormat(workingDirectory: string): Promise<string | null> {
+  const result = await runQuickChecks(workingDirectory);
+  return result.passed ? null : result.agentRetryPrompt;
+}
+
+/**
+ * Spawn a command and collect exit code + output. Never rejects: spawn errors
+ * and timeout kills resolve as exitCode 1 with whatever output was captured.
+ */
+function spawnCommand(
+  command: string,
+  args: string[],
+  cwd: string,
+  timeoutMs: number,
+): Promise<{ exitCode: number; stdout: string; stderr: string }> {
+  return new Promise((resolve) => {
+    // shell: true so package-manager shims (.cmd on Windows) resolve;
+    // timeout kills hung typecheck/build processes.
+    const proc = spawn(command, args, {
+      cwd,
+      shell: true,
+      timeout: timeoutMs,
+    });
+
+    let stdout = '';
+    let stderr = '';
+
+    proc.stdout?.on('data', (data: Buffer) => {
+      stdout += data.toString();
+    });
+    proc.stderr?.on('data', (data: Buffer) => {
+      stderr += data.toString();
+    });
+
+    proc.on('close', (code) => {
+      resolve({ exitCode: code ?? 
1, stdout, stderr }); + }); + + proc.on('error', () => { + resolve({ exitCode: 1, stdout, stderr }); + }); + }); +} diff --git a/src/lib/validation/types.ts b/src/lib/validation/types.ts index e3675fb..25a5ea5 100644 --- a/src/lib/validation/types.ts +++ b/src/lib/validation/types.ts @@ -47,5 +47,22 @@ export interface ValidationRules { variants?: Record; } +export interface QuickCheckResult { + passed: boolean; + phase: 'typecheck' | 'build'; + issues: ValidationIssue[]; + /** Formatted for agent consumption — actionable, not just error messages */ + agentPrompt: string | null; + durationMs: number; +} + +export interface QuickChecksOutput { + passed: boolean; + results: QuickCheckResult[]; + /** Combined agent-ready prompt summarizing all failures */ + agentRetryPrompt: string | null; + totalDurationMs: number; +} + // Re-export BuildResult from build-validator export type { BuildResult } from './build-validator.js'; diff --git a/src/lib/validation/validator.ts b/src/lib/validation/validator.ts index 772adba..6acaa43 100644 --- a/src/lib/validation/validator.ts +++ b/src/lib/validation/validator.ts @@ -30,12 +30,12 @@ export async function validateInstallation( } // Run validations - await validatePackages(rules, projectDir, issues); - await validateEnvVars(rules, projectDir, issues); - await validateFiles(rules, projectDir, issues); + issues.push(...(await validatePackages(rules, projectDir))); + issues.push(...(await validateEnvVars(rules, projectDir))); + issues.push(...(await validateFiles(rules, projectDir))); // Run framework-specific cross-validations - await validateFrameworkSpecific(framework, projectDir, issues); + issues.push(...(await validateFrameworkSpecific(framework, projectDir))); // Run build validation if enabled if (options.runBuild !== false) { @@ -74,16 +74,17 @@ async function loadRules(framework: string, variant?: string): Promise { +export async function validatePackages(rules: ValidationRules, projectDir: string): Promise { + const 
issues: ValidationIssue[] = []; const pkgPath = join(projectDir, 'package.json'); - if (!existsSync(pkgPath)) return; + if (!existsSync(pkgPath)) return issues; let pkg: Record; try { pkg = JSON.parse(await readFile(pkgPath, 'utf-8')); } catch { // Malformed package.json - skip package validation - return; + return issues; } const deps = (pkg.dependencies || {}) as Record; @@ -103,9 +104,12 @@ async function validatePackages(rules: ValidationRules, projectDir: string, issu }); } } + + return issues; } -async function validateEnvVars(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise { +export async function validateEnvVars(rules: ValidationRules, projectDir: string): Promise { + const issues: ValidationIssue[] = []; const envPath = join(projectDir, '.env.local'); let envContent = ''; @@ -120,7 +124,7 @@ async function validateEnvVars(rules: ValidationRules, projectDir: string, issue hint: 'Create .env.local with required environment variables', }); } - return; + return issues; } for (const rule of rules.envVars) { @@ -144,9 +148,13 @@ async function validateEnvVars(rules: ValidationRules, projectDir: string, issue }); } } + + return issues; } -async function validateFiles(rules: ValidationRules, projectDir: string, issues: ValidationIssue[]): Promise { +export async function validateFiles(rules: ValidationRules, projectDir: string): Promise { + const issues: ValidationIssue[] = []; + for (const rule of rules.files) { let matches: string[]; try { @@ -205,16 +213,16 @@ async function validateFiles(rules: ValidationRules, projectDir: string, issues: } } } + + return issues; } /** * Framework-specific cross-validations that require reading multiple sources. 
*/ -async function validateFrameworkSpecific( - framework: string, - projectDir: string, - issues: ValidationIssue[], -): Promise { +export async function validateFrameworkSpecific(framework: string, projectDir: string): Promise { + const issues: ValidationIssue[] = []; + // Universal cross-validations await validateCredentialFormats(projectDir, issues); await validateDuplicateEnvVars(projectDir, issues); @@ -238,6 +246,8 @@ async function validateFrameworkSpecific( await validateCookiePasswordLength(projectDir, issues, 'WORKOS_COOKIE_PASSWORD'); break; } + + return issues; } /** diff --git a/src/utils/types.ts b/src/utils/types.ts index cb54d76..901a05c 100644 --- a/src/utils/types.ts +++ b/src/utils/types.ts @@ -91,6 +91,13 @@ export type InstallerOptions = { * Requires ANTHROPIC_API_KEY environment variable. */ direct?: boolean; + + /** + * Max correction attempts after initial agent run. + * The agent gets this many chances to fix validation failures (typecheck/build). + * Default: 2. Set to 0 to disable retries entirely. 
+ */ + maxRetries?: number; }; export interface Feature { diff --git a/tests/evals/__tests__/agent-executor.spec.ts b/tests/evals/__tests__/agent-executor.spec.ts new file mode 100644 index 0000000..62f057d --- /dev/null +++ b/tests/evals/__tests__/agent-executor.spec.ts @@ -0,0 +1,260 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + +// Hoist mocks so they're available in vi.mock factories +const { mockRunAgent, mockConfig, mockCredentials } = vi.hoisted(() => ({ + mockRunAgent: vi.fn(), + mockConfig: { + model: 'test-model', + workos: { clientId: 'client_test', authkitDomain: 'test.workos.com', llmGatewayUrl: 'http://localhost:8000' }, + telemetry: { enabled: false, eventName: 'test_event' }, + proxy: { refreshThresholdMs: 300000 }, + nodeVersion: '20', + logging: { debugMode: false }, + documentation: { + workosDocsUrl: 'https://workos.com/docs', + dashboardUrl: 'https://dashboard.workos.com', + issuesUrl: 'https://github.com', + }, + frameworks: {}, + legacy: { oauthPort: 3000 }, + branding: { showAsciiArt: false, asciiArt: '', compactAsciiArt: '', useCompact: false }, + }, + mockCredentials: { + workosApiKey: 'sk_test_key', + workosClientId: 'client_test_id', + anthropicApiKey: 'sk-ant-test', + }, +})); + +// Mock the production runAgent — this is what we're testing the wiring to +vi.mock('../../../src/lib/agent-interface.js', () => ({ + runAgent: mockRunAgent, +})); + +// Mock dependencies +vi.mock('../env-loader.js', () => ({ + loadCredentials: vi.fn(() => mockCredentials), +})); + +vi.mock('../../../src/lib/env-writer.js', () => ({ + writeEnvLocal: vi.fn(), +})); + +vi.mock('../../../src/utils/env-parser.js', () => ({ + parseEnvFile: vi.fn(() => ({})), +})); + +vi.mock('../../../src/lib/settings.js', () => ({ + getConfig: vi.fn(() => mockConfig), +})); + 
+vi.mock('../../../src/lib/validation/quick-checks.js', () => ({ + quickCheckValidateAndFormat: vi.fn(), +})); + +// Mock debug/analytics that agent-interface transitively imports +vi.mock('../../../src/utils/debug.js', () => ({ + debug: vi.fn(), + logInfo: vi.fn(), + logWarn: vi.fn(), + logError: vi.fn(), + initLogFile: vi.fn(), + getLogFilePath: vi.fn(() => null), +})); + +vi.mock('../../../src/utils/analytics.js', () => ({ + analytics: { + capture: vi.fn(), + setTag: vi.fn(), + shutdown: vi.fn(), + llmRequest: vi.fn(), + incrementAgentIterations: vi.fn(), + toolCalled: vi.fn(), + }, +})); + +import { AgentExecutor } from '../agent-executor.js'; +import { writeEnvLocal } from '../../../src/lib/env-writer.js'; + +describe('AgentExecutor', () => { + let testDir: string; + + beforeEach(() => { + testDir = mkdtempSync(join(tmpdir(), 'agent-executor-test-')); + // Create package.json so env writing works + writeFileSync(join(testDir, 'package.json'), JSON.stringify({ name: 'test' })); + mockRunAgent.mockReset(); + }); + + afterEach(() => { + rmSync(testDir, { recursive: true, force: true }); + }); + + it('calls production runAgent with correct AgentRunConfig', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + expect(mockRunAgent).toHaveBeenCalledTimes(1); + + const [agentRunConfig] = mockRunAgent.mock.calls[0]; + expect(agentRunConfig.workingDirectory).toBe(testDir); + expect(agentRunConfig.model).toBe('test-model'); + expect(agentRunConfig.allowedTools).toContain('Skill'); + expect(agentRunConfig.allowedTools).toContain('Write'); + expect(agentRunConfig.mcpServers).toHaveProperty('workos'); + // Direct mode — no gateway URL + expect(agentRunConfig.sdkEnv.ANTHROPIC_API_KEY).toBe('sk-ant-test'); + expect(agentRunConfig.sdkEnv.ANTHROPIC_BASE_URL).toBeUndefined(); + }); + + it('passes RetryConfig when correction is enabled', async () => { + 
mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run({ enabled: true, maxRetries: 3 }); + + const retryConfig = mockRunAgent.mock.calls[0][5]; // 6th arg + expect(retryConfig).toBeDefined(); + expect(retryConfig.maxRetries).toBe(3); + expect(typeof retryConfig.validateAndFormat).toBe('function'); + }); + + it('passes no RetryConfig when correction is disabled', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run({ enabled: false, maxRetries: 0 }); + + const retryConfig = mockRunAgent.mock.calls[0][5]; + expect(retryConfig).toBeUndefined(); + }); + + it('passes InstallerOptions with skipAuth=true', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + const installerOptions = mockRunAgent.mock.calls[0][2]; // 3rd arg + expect(installerOptions.skipAuth).toBe(true); + expect(installerOptions.installDir).toBe(testDir); + }); + + it('passes onMessage callback as 7th argument', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + const onMessage = mockRunAgent.mock.calls[0][6]; // 7th arg + expect(typeof onMessage).toBe('function'); + }); + + it('maps retryCount=0 to correctionAttempts=0, selfCorrected=false', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(true); + expect(result.correctionAttempts).toBe(0); + expect(result.selfCorrected).toBe(false); + }); + + it('maps retryCount>0 to selfCorrected=true on success', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 2 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = 
await executor.run(); + + expect(result.success).toBe(true); + expect(result.correctionAttempts).toBe(2); + expect(result.selfCorrected).toBe(true); + }); + + it('maps runAgent error result to failed AgentResult', async () => { + mockRunAgent.mockResolvedValue({ + error: 'EXECUTION_ERROR', + errorMessage: 'SDK crashed', + retryCount: 1, + }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(false); + expect(result.error).toBe('SDK crashed'); + expect(result.correctionAttempts).toBe(1); + expect(result.selfCorrected).toBe(false); + }); + + it('handles runAgent throwing an exception', async () => { + mockRunAgent.mockRejectedValue(new Error('Connection refused')); + + const executor = new AgentExecutor(testDir, 'nextjs'); + const result = await executor.run(); + + expect(result.success).toBe(false); + expect(result.error).toBe('Connection refused'); + expect(result.correctionAttempts).toBe(0); + }); + + it('writes env vars before calling runAgent', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + expect(writeEnvLocal).toHaveBeenCalledWith(testDir, { + WORKOS_API_KEY: 'sk_test_key', + WORKOS_CLIENT_ID: 'client_test_id', + }); + }); + + it('onMessage callback collects text output from assistant messages', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); + + // Get the onMessage callback and simulate a message + const onMessage = mockRunAgent.mock.calls[0][6]; + onMessage({ + type: 'assistant', + message: { + content: [{ type: 'text', text: 'Installing AuthKit...' 
}], + }, + }); + + // Run again to verify output is collected (can't check internal state, + // but we can verify it doesn't throw) + expect(onMessage).toBeDefined(); + }); + + it('builds prompt with correct skill name for framework', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'react-router'); + await executor.run(); + + const prompt = mockRunAgent.mock.calls[0][1]; // 2nd arg + expect(prompt).toContain('workos-authkit-react-router'); + expect(prompt).toContain('react-router'); + }); + + it('defaults to correction enabled with maxRetries=2', async () => { + mockRunAgent.mockResolvedValue({ retryCount: 0 }); + + const executor = new AgentExecutor(testDir, 'nextjs'); + await executor.run(); // no retryConfig arg — uses default + + const retryConfig = mockRunAgent.mock.calls[0][5]; + expect(retryConfig).toBeDefined(); + expect(retryConfig.maxRetries).toBe(2); + }); +}); diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts index 3c4b0cd..46c2a72 100644 --- a/tests/evals/agent-executor.ts +++ b/tests/evals/agent-executor.ts @@ -1,12 +1,13 @@ -import path from 'node:path'; import { writeFileSync, existsSync, readFileSync } from 'node:fs'; import { join } from 'node:path'; -import { fileURLToPath } from 'node:url'; import { loadCredentials } from './env-loader.js'; import { writeEnvLocal } from '../../src/lib/env-writer.js'; import { parseEnvFile } from '../../src/utils/env-parser.js'; import { getConfig } from '../../src/lib/settings.js'; import { LatencyTracker } from './latency-tracker.js'; +import { quickCheckValidateAndFormat } from '../../src/lib/validation/quick-checks.js'; +import { runAgent, type AgentRunConfig, type RetryConfig } from '../../src/lib/agent-interface.js'; +import type { InstallerOptions } from '../../src/utils/types.js'; import type { ToolCall, LatencyMetrics } from './types.js'; export interface AgentResult { @@ -15,6 +16,17 @@ export interface 
AgentResult { toolCalls: ToolCall[]; error?: string; latencyMetrics?: LatencyMetrics; + /** Number of within-session correction attempts */ + correctionAttempts: number; + /** Whether the agent self-corrected after an initial failure */ + selfCorrected: boolean; +} + +export interface AgentRetryConfig { + /** Enable within-session correction. Default: true */ + enabled: boolean; + /** Max correction attempts. Default: 2 */ + maxRetries: number; } export interface AgentExecutorOptions { @@ -77,20 +89,18 @@ export class AgentExecutor { this.latencyTracker = new LatencyTracker(); } - async run(): Promise { - const integration = this.getIntegration(); + async run(retryConfig?: AgentRetryConfig): Promise { + const config = retryConfig ?? { enabled: true, maxRetries: 2 }; const toolCalls: ToolCall[] = []; const collectedOutput: string[] = []; const label = this.options.scenarioName ? `[${this.options.scenarioName}]` : ''; if (this.options.verbose) { - console.log(`${label} Initializing agent for ${integration}...`); + console.log(`${label} Initializing agent for ${this.framework}...`); } - // Start latency tracking this.latencyTracker.start(); - // Write credentials to appropriate env file based on framework const envVars = { WORKOS_API_KEY: this.credentials.workosApiKey, WORKOS_CLIENT_ID: this.credentials.workosClientId, @@ -102,69 +112,74 @@ export class AgentExecutor { writeEnvFile(this.workDir, envVars); } - // Build prompt - const skillName = SKILL_NAMES[integration]; + const skillName = SKILL_NAMES[this.framework]; const prompt = this.buildPrompt(skillName); - // Initialize and run agent - try { - const { query } = await import('@anthropic-ai/claude-agent-sdk'); - - // Build SDK environment for direct mode - const sdkEnv: Record = { - ...process.env, - ANTHROPIC_API_KEY: this.credentials.anthropicApiKey, - CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true', - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true', - }; - // Remove gateway config to use direct API - delete 
sdkEnv.ANTHROPIC_BASE_URL; - delete sdkEnv.ANTHROPIC_AUTH_TOKEN; - - // Get plugin path for skills - const __filename = fileURLToPath(import.meta.url); - const __dirname = path.dirname(__filename); - const pluginPath = path.join(__dirname, '../..'); - - const response = query({ - prompt: prompt, - options: { - model: getConfig().model, - cwd: this.workDir, - permissionMode: 'acceptEdits', - mcpServers: { - workos: { - command: 'npx', - args: ['-y', '@workos/mcp-docs-server'], - }, - }, - env: sdkEnv, - tools: { type: 'preset', preset: 'claude_code' }, - allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'], - plugins: [{ type: 'local', path: pluginPath }], + const sdkEnv: Record = { + ...process.env, + ANTHROPIC_API_KEY: this.credentials.anthropicApiKey, + ANTHROPIC_BASE_URL: undefined, + ANTHROPIC_AUTH_TOKEN: undefined, + CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: 'true', + CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: 'true', + }; + + const agentRunConfig: AgentRunConfig = { + workingDirectory: this.workDir, + mcpServers: { + workos: { + command: 'npx', + args: ['-y', '@workos/mcp-docs-server'], }, - }); + }, + model: getConfig().model, + allowedTools: ['Skill', 'Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep', 'WebFetch'], + sdkEnv, + }; - // Process message stream - for await (const message of response) { - this.handleMessage(message, toolCalls, collectedOutput, label); - } + const installerOptions: InstallerOptions = { + debug: this.options.verbose ?? false, + forceInstall: false, + installDir: this.workDir, + local: false, + ci: true, + skipAuth: true, + }; + + const prodRetryConfig: RetryConfig | undefined = config.enabled + ? 
{ maxRetries: config.maxRetries, validateAndFormat: quickCheckValidateAndFormat } + : undefined; + + try { + // Delegate to production runAgent — same retry loop, same generator coordination + const result = await runAgent( + agentRunConfig, + prompt, + installerOptions, + undefined, // no spinner config + undefined, // no emitter + prodRetryConfig, + (message) => this.trackMessage(message, toolCalls, collectedOutput, label), + ); const latencyMetrics = this.latencyTracker.finish(); - return { - success: true, - output: collectedOutput.join('\n'), - toolCalls, - latencyMetrics, - }; + const correctionAttempts = result.retryCount ?? 0; + const base = { output: collectedOutput.join('\n'), toolCalls, latencyMetrics, correctionAttempts }; + + if (result.error) { + return { ...base, success: false, error: result.errorMessage ?? String(result.error), selfCorrected: false }; + } + + return { ...base, success: true, selfCorrected: correctionAttempts > 0 }; } catch (error) { - const latencyMetrics = this.latencyTracker.finish(); return { success: false, output: collectedOutput.join('\n'), toolCalls, - latencyMetrics, + latencyMetrics: this.latencyTracker.finish(), error: error instanceof Error ? error.message : String(error), + correctionAttempts: 0, + selfCorrected: false, }; } } @@ -187,15 +202,17 @@ Use the \`${skillName}\` skill to integrate WorkOS AuthKit into this application Begin by invoking the ${skillName} skill.`; } - private handleMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void { + /** + * Observe SDK messages for latency tracking and output collection. + * This is called via the onMessage hook — production handleSDKMessage runs first. 
+ */ + private trackMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void { if (message.type === 'assistant') { - // End any in-progress tool call when we get a new assistant message this.latencyTracker.endToolCall(); const content = message.message?.content; if (Array.isArray(content)) { for (const block of content) { - // Capture text output and track TTFT if (block.type === 'text' && typeof block.text === 'string') { this.latencyTracker.recordFirstContent(); collectedOutput.push(block.text); @@ -203,14 +220,12 @@ Begin by invoking the ${skillName} skill.`; console.log(`${label} Agent: ${block.text.slice(0, 100)}...`); } } - // Capture tool calls and start timing if (block.type === 'tool_use') { this.latencyTracker.startToolCall(block.name); - const call: ToolCall = { + toolCalls.push({ tool: block.name, input: block.input as Record, - }; - toolCalls.push(call); + }); if (this.options.verbose) { console.log(`${label} Tool: ${block.name}`); } @@ -220,7 +235,6 @@ Begin by invoking the ${skillName} skill.`; } if (message.type === 'result') { - // Capture token usage from result if (message.usage) { this.latencyTracker.recordTokens(message.usage.input_tokens ?? 0, message.usage.output_tokens ?? 
0); } @@ -229,9 +243,4 @@ Begin by invoking the ${skillName} skill.`; } } } - - private getIntegration(): string { - // Integration is now a string type — framework name IS the integration name - return this.framework; - } } diff --git a/tests/evals/cli.ts b/tests/evals/cli.ts index 757a959..12a4cca 100644 --- a/tests/evals/cli.ts +++ b/tests/evals/cli.ts @@ -12,6 +12,7 @@ export interface CliOptions { sequential: boolean; noDashboard: boolean; noFail: boolean; + noCorrection: boolean; quality: boolean; command?: 'run' | 'history' | 'compare' | 'diff' | 'prune' | 'logs' | 'show'; compareIds?: [string, string]; @@ -61,6 +62,7 @@ export function parseArgs(args: string[]): CliOptions { sequential: false, noDashboard: false, noFail: false, + noCorrection: false, quality: false, }; @@ -144,6 +146,8 @@ export function parseArgs(args: string[]): CliOptions { options.noDashboard = true; } else if (arg === '--no-fail') { options.noFail = true; + } else if (arg === '--no-correction') { + options.noCorrection = true; } else if (arg === '--quality' || arg === '-q') { options.quality = true; } @@ -193,6 +197,8 @@ Options: --no-fail Exit 0 even if success criteria thresholds not met + --no-correction Disable within-session agent self-correction retries + --quality, -q Enable LLM-based quality grading (adds cost/time) --json Output results as JSON (for scripting) diff --git a/tests/evals/graders/quality-grader.ts b/tests/evals/graders/quality-grader.ts index 91165a4..22d1bf6 100644 --- a/tests/evals/graders/quality-grader.ts +++ b/tests/evals/graders/quality-grader.ts @@ -88,8 +88,9 @@ Then, output your final scores as JSON. const thinkingMatch = text.match(/([\s\S]*?)<\/thinking>/); const reasoning = thinkingMatch?.[1]?.trim() || 'No reasoning provided'; - // Extract JSON scores (after thinking block) - const jsonMatch = text.match(/\{[\s\S]*\}/); + // Extract JSON scores — look after tag to avoid matching braces in reasoning + const afterThinking = thinkingMatch ? 
text.slice(text.indexOf('') + ''.length) : text; + const jsonMatch = afterThinking.match(/\{[^{}]*\}/); if (!jsonMatch) return null; const parsed = JSON.parse(jsonMatch[0]) as Record; diff --git a/tests/evals/index.ts b/tests/evals/index.ts index 7e92274..118f3a2 100644 --- a/tests/evals/index.ts +++ b/tests/evals/index.ts @@ -60,6 +60,7 @@ async function main() { noDashboard: options.noDashboard, debug: options.debug, noFail: options.noFail, + noCorrection: options.noCorrection, quality: options.quality, }); diff --git a/tests/evals/parallel-runner.ts b/tests/evals/parallel-runner.ts index 4bf2f35..0af0074 100644 --- a/tests/evals/parallel-runner.ts +++ b/tests/evals/parallel-runner.ts @@ -18,6 +18,7 @@ interface ParallelRunnerOptions { keep?: boolean; keepOnFail?: boolean; concurrency?: number; // Override auto-detection + noCorrection?: boolean; } export class ParallelRunner { @@ -125,7 +126,9 @@ export class ParallelRunner { verbose: this.options.verbose, scenarioName, }); - const agentResult = await executor.run(); + const agentResult = await executor.run( + this.options.noCorrection ? 
{ enabled: false, maxRetries: 0 } : undefined, + ); lastToolCalls = agentResult.toolCalls; const grader = new scenario.grader(workDir); @@ -143,6 +146,8 @@ export class ParallelRunner { attempts: attempt, latencyMetrics: agentResult.latencyMetrics, keyFiles, + correctionAttempts: agentResult.correctionAttempts, + selfCorrected: agentResult.selfCorrected, }; if (gradeResult.passed) { @@ -181,11 +186,7 @@ export class ParallelRunner { if (lastResult && !lastResult.passed) { console.log(`✗ ${scenarioName} FAILED`); - if (!this.options.verbose) { - this.printFailureDetails(lastResult, false); - } else { - this.printFailureDetails(lastResult, true); - } + this.printFailureDetails(lastResult, !!this.options.verbose); evalEvents.emitScenarioFail({ scenario: scenarioName, framework: scenario.framework, diff --git a/tests/evals/reporter.ts b/tests/evals/reporter.ts index 4cd1b4f..316dd7e 100644 --- a/tests/evals/reporter.ts +++ b/tests/evals/reporter.ts @@ -59,7 +59,10 @@ export function printMatrix(results: EvalResult[]): void { const passed = results.filter((r) => r.passed).length; const total = results.length; const rate = ((passed / total) * 100).toFixed(1); - console.log(`\nResults: ${passed}/${total} passed (${rate}%)`); + const selfCorrected = results.filter((r) => r.selfCorrected).length; + console.log( + `\nResults: ${passed}/${total} passed (${rate}%)${selfCorrected > 0 ? 
`, ${selfCorrected} self-corrected` : ''}`, + ); if (passed < total) { console.log('\nFailed scenarios:'); diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts index c48db6d..5375ea3 100644 --- a/tests/evals/runner.ts +++ b/tests/evals/runner.ts @@ -87,8 +87,8 @@ const SCENARIOS: Scenario[] = [ { framework: 'elixir', state: 'example', grader: ElixirGrader }, { framework: 'elixir', state: 'example-auth0', grader: ElixirGrader }, - // .NET (broken — no runtime) - { framework: 'dotnet', state: 'example', grader: DotnetGrader }, + // .NET (disabled — SDK is broken and no runtime available on most machines) + // { framework: 'dotnet', state: 'example', grader: DotnetGrader }, ]; export interface ExtendedEvalOptions extends EvalOptions { @@ -98,6 +98,7 @@ export interface ExtendedEvalOptions extends EvalOptions { noDashboard?: boolean; debug?: boolean; noFail?: boolean; + noCorrection?: boolean; quality?: boolean; } @@ -122,6 +123,7 @@ export async function runEvals(options: ExtendedEvalOptions): Promise { describe('DEFAULT_CRITERIA', () => { it('has expected default thresholds', () => { - expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.9); + expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.8); + expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9); expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95); }); }); describe('validateResults', () => { it('returns passed=true when all criteria met', () => { - // 10 results, 9 passed on first attempt, 1 passed on retry + // 10 results: 8 clean (80%), 1 corrected (9/10 = 90% correction), 1 retried (100% retry) const results: EvalResult[] = [ - ...Array(9) + ...Array(8) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), + makeResult(true, 1, 1), makeResult(true, 2), ]; @@ -33,34 +36,36 @@ describe('success-criteria', () => { expect(validation.passed).toBe(true); expect(validation.failures).toHaveLength(0); - expect(validation.actual.firstAttemptPassRate).toBe(0.9); + 
expect(validation.actual.firstAttemptPassRate).toBe(0.8); + expect(validation.actual.withCorrectionPassRate).toBe(0.9); expect(validation.actual.withRetryPassRate).toBe(1); }); it('returns passed=false when first-attempt rate below threshold', () => { - // 10 results, only 8 passed on first attempt + // 10 results: 7 clean (70% < 80%), 2 corrected (90% correction), 1 retried const results: EvalResult[] = [ - ...Array(8) + ...Array(7) .fill(null) - .map(() => makeResult(true, 1)), - makeResult(true, 2), + .map(() => makeResult(true, 1, 0)), + ...Array(2) + .fill(null) + .map(() => makeResult(true, 1, 1)), makeResult(true, 2), ]; const validation = validateResults(results); expect(validation.passed).toBe(false); - expect(validation.failures).toHaveLength(1); - expect(validation.failures[0]).toContain('First-attempt'); - expect(validation.failures[0]).toContain('80.0%'); + expect(validation.failures.some((f) => f.includes('First-attempt'))).toBe(true); }); it('returns passed=false when with-retry rate below threshold', () => { - // 10 results, 9 passed first attempt, 1 failed entirely + // 10 results: 8 clean (80%), 1 corrected (90% correction), 1 failed → 90% retry < 95% const results: EvalResult[] = [ - ...Array(9) + ...Array(8) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), + makeResult(true, 1, 1), makeResult(false, 3), ]; @@ -71,21 +76,24 @@ describe('success-criteria', () => { expect(validation.failures[0]).toContain('With-retry'); }); - it('returns both failures when both criteria not met', () => { - // 10 results, 7 passed first attempt, 1 failed + it('returns both failures when multiple criteria not met', () => { + // 10 results: 2 clean (20% < 50%), 4 corrected, 4 failed (60% < 95% retry) const results: EvalResult[] = [ - ...Array(7) + ...Array(2) .fill(null) - .map(() => makeResult(true, 1)), - makeResult(true, 2), - makeResult(true, 2), - makeResult(false, 3), + .map(() => makeResult(true, 1, 0)), + ...Array(4) + 
.fill(null) + .map(() => makeResult(true, 1, 1)), + ...Array(4) + .fill(null) + .map(() => makeResult(false, 3)), ]; const validation = validateResults(results); expect(validation.passed).toBe(false); - expect(validation.failures).toHaveLength(2); + expect(validation.failures.length).toBeGreaterThanOrEqual(2); }); it('handles empty results array', () => { @@ -120,11 +128,18 @@ describe('success-criteria', () => { }); it('passes when exactly at threshold', () => { - // Exactly 90% first-attempt, 95% with-retry + // 20 results: + // 16 clean first-attempt (attempt=1, corrections=0) → 80% first-attempt + // 2 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction + // 1 passed on scenario retry (attempt=2) → 19/20 = 95% with-retry + // 1 failed (attempt=3) const results: EvalResult[] = [ - ...Array(18) + ...Array(16) .fill(null) - .map(() => makeResult(true, 1)), + .map(() => makeResult(true, 1, 0)), + ...Array(2) + .fill(null) + .map(() => makeResult(true, 1, 1)), makeResult(true, 2), makeResult(false, 3), ]; diff --git a/tests/evals/success-criteria.ts b/tests/evals/success-criteria.ts index f6b3cb7..6805ed0 100644 --- a/tests/evals/success-criteria.ts +++ b/tests/evals/success-criteria.ts @@ -7,7 +7,9 @@ import type { EvalResult } from './types.js'; export interface SuccessCriteria { /** Minimum pass rate on first attempt (0-1) */ firstAttemptPassRate: number; - /** Minimum pass rate with retries (0-1) */ + /** Minimum pass rate after within-session correction (0-1) */ + withCorrectionPassRate?: number; + /** Minimum pass rate with full scenario retries (0-1) */ withRetryPassRate: number; /** Maximum duration per scenario in ms (optional, for future use) */ maxDurationMs?: number; @@ -15,7 +17,8 @@ export interface SuccessCriteria { /** Default thresholds for CI enforcement */ export const DEFAULT_CRITERIA: SuccessCriteria = { - firstAttemptPassRate: 0.9, + firstAttemptPassRate: 0.8, + withCorrectionPassRate: 0.9, withRetryPassRate: 0.95, }; @@ 
-24,6 +27,7 @@ export interface ValidationResult { criteria: SuccessCriteria; actual: { firstAttemptPassRate: number; + withCorrectionPassRate: number; withRetryPassRate: number; }; failures: string[]; @@ -34,10 +38,16 @@ export interface ValidationResult { * Returns detailed breakdown of pass/fail status with actionable messages. */ export function validateResults(results: EvalResult[], criteria: SuccessCriteria = DEFAULT_CRITERIA): ValidationResult { - const firstAttemptPassed = results.filter((r) => r.attempts === 1 && r.passed).length; + // First attempt: passed on first scenario attempt with no corrections + const firstAttemptPassed = results.filter( + (r) => r.attempts === 1 && r.passed && (r.correctionAttempts ?? 0) === 0, + ).length; + // With correction: passed on first scenario attempt (may have used within-session correction) + const withCorrectionPassed = results.filter((r) => r.attempts === 1 && r.passed).length; const totalPassed = results.filter((r) => r.passed).length; const firstAttemptRate = results.length > 0 ? firstAttemptPassed / results.length : 0; + const withCorrectionRate = results.length > 0 ? withCorrectionPassed / results.length : 0; const withRetryRate = results.length > 0 ? 
totalPassed / results.length : 0; const failures: string[] = []; @@ -46,6 +56,11 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria `First-attempt pass rate ${(firstAttemptRate * 100).toFixed(1)}% < ${criteria.firstAttemptPassRate * 100}% required`, ); } + if (criteria.withCorrectionPassRate !== undefined && withCorrectionRate < criteria.withCorrectionPassRate) { + failures.push( + `With-correction pass rate ${(withCorrectionRate * 100).toFixed(1)}% < ${criteria.withCorrectionPassRate * 100}% required`, + ); + } if (withRetryRate < criteria.withRetryPassRate) { failures.push( `With-retry pass rate ${(withRetryRate * 100).toFixed(1)}% < ${criteria.withRetryPassRate * 100}% required`, @@ -55,7 +70,11 @@ export function validateResults(results: EvalResult[], criteria: SuccessCriteria return { passed: failures.length === 0, criteria, - actual: { firstAttemptPassRate: firstAttemptRate, withRetryPassRate: withRetryRate }, + actual: { + firstAttemptPassRate: firstAttemptRate, + withCorrectionPassRate: withCorrectionRate, + withRetryPassRate: withRetryRate, + }, failures, }; } diff --git a/tests/evals/types.ts b/tests/evals/types.ts index 3f626d2..9891597 100644 --- a/tests/evals/types.ts +++ b/tests/evals/types.ts @@ -27,6 +27,10 @@ export interface EvalResult { qualityGrade?: QualityGrade; /** Key integration files for quality grading (replaces raw diff) */ keyFiles?: Map; + /** Within-session correction attempts (0 = passed first try) */ + correctionAttempts?: number; + /** Agent self-corrected after initial failure */ + selfCorrected?: boolean; } /** Input for quality grading - structured data instead of raw diff */