diff --git a/docs/design/telemetry-llm-request-timing-design.md b/docs/design/telemetry-llm-request-timing-design.md index 4a41b082d16..e1b1b06eb9f 100644 --- a/docs/design/telemetry-llm-request-timing-design.md +++ b/docs/design/telemetry-llm-request-timing-design.md @@ -126,7 +126,13 @@ When `attempt === 1` and no retries happened, `request_setup_ms` is small (just 2. **Single-trace debug** — operator sees `duration_ms=12000, request_setup_ms=11500, ttft_ms=200, sampling_ms=300` → instantly diagnoses "retries ate 11.5s, model itself was fast." Computing `request_setup_ms` from other fields requires also exposing `sampling_ms`, which we do anyway (D6). 3. **Negligible cost** — 1 INT64 attribute. Same order of magnitude as the existing `input_tokens`, `output_tokens` attributes. Backend ingest cost is not material. -### D4 — Retry telemetry: `onRetry` callback option on `retryWithBackoff` + new `ApiRetryEvent` +### D4 — Retry telemetry: `onRetry` callback option on `retryWithBackoff` + `ApiRetryEvent` + AsyncLocalStorage propagation + +> **Phase 4b update (post-design discovery)**: this section was originally written assuming claude-code's "one LLM span owns the retry loop" pattern. While implementing Phase 4b, we discovered that qwen-code's 4 `retryWithBackoff` call sites (`client.ts:2109`, `baseLlmClient.ts:235,333`, `geminiChat.ts:2035` — line numbers as of merge) all wrap `apiCall = () => contentGenerator.generateContent(...)`. The retry layer sits **above** LoggingContentGenerator. Each retry attempt invokes `apiCall()` fresh → fresh `qwen-code.llm_request` span. There is no single shared span across attempts. An in-`LoggingContentGenerator` accumulator wouldn't work. +> +> **Resolution**: propagate retry state via `AsyncLocalStorage` (`retryContext` in `packages/core/src/utils/retryContext.ts`). `retryWithBackoff` wraps each `await fn()` in `retryContext.run({ attempt, requestSetupMs, retryTotalDelayMs }, fn)`. `LoggingContentGenerator` reads the ALS in its synchronous prelude and forwards the values to `endLLMRequestSpan`. This actually gives **richer** observability than the original plan — each per-attempt span has its own `duration_ms` / `ttft_ms` / error details AND knows where in the retry budget it sits via the per-attempt `attempt` / `requestSetupMs` / `retryTotalDelayMs` attributes. +> +> The ALS approach matches existing patterns in the codebase (`promptIdContext`, `subagentNameContext`, `agent-context`) — minimal new surface, well-understood semantics. Plan-mode review process captured this revision through 3 review rounds finding 22 issues, all addressed before merge. `retryWithBackoff` currently calls `logRetryAttempt` (`retry.ts:343`) which only writes to `debugLogger.warn`. We extend the `RetryOptions` interface with an opt-in callback: @@ -188,14 +194,14 @@ export class ApiRetryEvent implements BaseTelemetryEvent { OTel span attributes are scalars (`string | number | boolean | array of these`). Map-typed attributes (like `retry_count_by_status: {429:2, 503:1}`) require JSON serialization and are awkward to query. Skip them. -| Attribute | Type | Semantic | -| -------------------------- | ------ | ----------------------------------------------------------------------------------- | -| `attempt` | int | 1-based final attempt count (`attemptStartTimes.length`) | -| `retry_total_delay_ms` | int | Sum of all `delayMs` reported by `onRetry`; 0 if no retries | -| `ttft_ms` | int | TTFT per D1; undefined for non-streaming or aborted-before-first-chunk requests | -| `request_setup_ms` | int | Per D3 | -| `sampling_ms` | int | Per D6 | -| `output_tokens_per_second` | double | Derived; `output_tokens / (sampling_ms / 1000)`; undefined when `sampling_ms === 0` | +| Attribute | Type | Semantic | +| -------------------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `attempt` | int | 1-based monotonic counter from `retryContext.attempt` (this attempt's iteration). Always populated (defaults to 1 when no retry context) | +| `retry_total_delay_ms` | int | Cumulative backoff sleep BEFORE this attempt started. Undefined for direct calls; 0 for attempt 1; > 0 for subsequent retried attempts | +| `ttft_ms` | int | TTFT per D1; undefined for non-streaming or aborted-before-first-chunk requests | +| `request_setup_ms` | int | Per D3 | +| `sampling_ms` | int | Per D6 | +| `output_tokens_per_second` | double | Derived; `output_tokens / (sampling_ms / 1000)`; undefined when `sampling_ms === 0` | Per-attempt status-code distribution (e.g., "2 of the 3 attempts were 429s") is queryable from log-bridge spans of `ApiRetryEvent` records. No need to duplicate it as a flattened attribute on the parent. diff --git a/packages/core/src/core/baseLlmClient.ts b/packages/core/src/core/baseLlmClient.ts index b2e56596b99..bda5eb854cb 100644 --- a/packages/core/src/core/baseLlmClient.ts +++ b/packages/core/src/core/baseLlmClient.ts @@ -27,6 +27,9 @@ import { import { reportError } from '../utils/errorReporting.js'; import { getErrorMessage } from '../utils/errors.js'; import { retryWithBackoff, isUnattendedMode } from '../utils/retry.js'; +import { subagentNameContext } from '../utils/subagentNameContext.js'; +import { ApiRetryEvent } from '../telemetry/types.js'; +import { logApiRetry } from '../telemetry/loggers.js'; import { getFunctionCalls } from '../utils/generateContentResponseUtilities.js'; import { getResponseText } from '../utils/partUtils.js'; import { createDebugLogger } from '../utils/debugLogger.js'; @@ -242,6 +245,20 @@ export class BaseLlmClient { `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`, ); }, + onRetry: (info) => { + logApiRetry( + this.config, + new ApiRetryEvent({ + model: requestModel, + promptId, + attemptNumber: info.attempt, + error: info.error, + statusCode: info.errorStatus, + retryDelayMs: info.delayMs, + subagentName: subagentNameContext.getStore(), + }), + ); + }, }); const functionCalls = getFunctionCalls(result); @@ -340,6 +357,20 @@ export class BaseLlmClient { `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`, ); }, + onRetry: (info) => { + logApiRetry( + this.config, + new ApiRetryEvent({ + model: requestModel, + promptId, + attemptNumber: info.attempt, + error: info.error, + statusCode: info.errorStatus, + retryDelayMs: info.delayMs, + subagentName: subagentNameContext.getStore(), + }), + ); + }, }); return { diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts index 103f4e8b889..46f4c22866d 100644 --- a/packages/core/src/core/client.ts +++ b/packages/core/src/core/client.ts @@ -94,7 +94,10 @@ import { } from '../utils/partUtils.js'; import { promptIdContext } from '../utils/promptIdContext.js'; import { retryWithBackoff, isUnattendedMode } from '../utils/retry.js'; +import { subagentNameContext } from '../utils/subagentNameContext.js'; import { escapeSystemReminderTags } from '../utils/xml.js'; +import { ApiRetryEvent } from '../telemetry/types.js'; +import { logApiRetry } from '../telemetry/loggers.js'; // Hook types and utilities import { @@ -2096,6 +2099,24 @@ export class GeminiClient { `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`, ); }, + // Phase 4b — emit ApiRetryEvent telemetry for HTTP-status retries. + // subagent_name read from subagentNameContext (active in catch block + // since the entire generateContent invocation runs inside the parent + // subagent's ALS frame when applicable). + onRetry: (info) => { + logApiRetry( + this.config, + new ApiRetryEvent({ + model: currentAttemptModel, + promptId, + attemptNumber: info.attempt, + error: info.error, + statusCode: info.errorStatus, + retryDelayMs: info.delayMs, + subagentName: subagentNameContext.getStore(), + }), + ); + }, }); return result; } catch (error: unknown) { diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts index 9d1ec8b44cf..6c17eec1272 100644 --- a/packages/core/src/core/geminiChat.ts +++ b/packages/core/src/core/geminiChat.ts @@ -40,8 +40,10 @@ import type { StructuredError } from './turn.js'; import { logContentRetry, logContentRetryFailure, + logApiRetry, } from '../telemetry/loggers.js'; import { clearDetailedSpanState } from '../telemetry/detailed-span-attributes.js'; +import { subagentNameContext } from '../utils/subagentNameContext.js'; import { type ChatRecordingService } from '../services/chatRecordingService.js'; import { ChatCompressionService, @@ -54,6 +56,7 @@ import { estimatePromptTokens } from '../services/tokenEstimation.js'; import { ContentRetryEvent, ContentRetryFailureEvent, + ApiRetryEvent, } from '../telemetry/types.js'; import type { UiTelemetryService } from '../telemetry/uiTelemetry.js'; import { type ChatCompressionInfo, CompressionStatus } from './turn.js'; @@ -2346,6 +2349,20 @@ export class GeminiChat { `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`, ); }, + onRetry: (info) => { + logApiRetry( + this.config, + new ApiRetryEvent({ + model, + promptId: prompt_id, + attemptNumber: info.attempt, + error: info.error, + statusCode: info.errorStatus, + retryDelayMs: info.delayMs, + subagentName: subagentNameContext.getStore(), + }), + ); + }, }); return this.processStreamResponse(model, streamResponse); diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts index 570cc9ae59c..bfaa49b27ff 100644 --- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts +++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts @@ -40,6 +40,11 @@ const loggingSpanRecords = vi.hoisted( success?: boolean; inputTokens?: number; outputTokens?: number; + cachedInputTokens?: number; + ttftMs?: number; + requestSetupMs?: number; + attempt?: number; + retryTotalDelayMs?: number; durationMs?: number; error?: string; }; @@ -2040,3 +2045,265 @@ describe('LoggingContentGenerator', () => { }, ); }); + +// ========================================================================= +// Phase 4b — retryContext ALS propagation into LoggingContentGenerator. +// Asserts the contract: when the LLM call runs inside a retryContext.run() +// frame, endLLMRequestSpan receives the frame's values. When no frame is +// present (warmup, side-query, direct call), `attempt` defaults to 1 and +// requestSetupMs/retryTotalDelayMs stay undefined. +// ========================================================================= +describe('LoggingContentGenerator — Phase 4b retry context propagation', () => { + beforeEach(() => { + loggingSpanRecords.length = 0; + vi.mocked(logApiRequest).mockClear(); + vi.mocked(logApiResponse).mockClear(); + vi.mocked(logApiError).mockClear(); + }); + + it('non-stream: forwards retryContext.attempt/requestSetupMs/retryTotalDelayMs to endLLMRequestSpan', async () => { + const { retryContext } = await import('../../utils/retryContext.js'); + + const wrapped = createWrappedGenerator( + vi.fn().mockResolvedValue( + createResponse('r-1', 'test-model', [{ text: 'ok' }], { + promptTokenCount: 10, + candidatesTokenCount: 5, + totalTokenCount: 15, + }), + ), + vi.fn(), + ); + const generator = new LoggingContentGenerator(wrapped, createConfig(), { + model: 'test-model', + authType: AuthType.USE_OPENAI, + enableOpenAILogging: false, + }); + const request = { + model: 'test-model', + contents: 'Hello', + } as unknown as GenerateContentParameters; + + // Simulate being invoked from within `retryWithBackoff`'s ALS frame — + // the LoggingContentGenerator must read these values and forward them. + await retryContext.run( + { attempt: 3, requestSetupMs: 1200, retryTotalDelayMs: 1000 }, + async () => { + await generator.generateContent(request, 'prompt-retry'); + }, + ); + + const record = getGenerateContentSpanRecord(); + expect(record.endMetadata).toMatchObject({ + success: true, + attempt: 3, + requestSetupMs: 1200, + retryTotalDelayMs: 1000, + }); + }); + + it('non-stream: defaults attempt=1 and leaves setup/delay undefined when no retry context (direct call / warmup)', async () => { + const wrapped = createWrappedGenerator( + vi.fn().mockResolvedValue( + createResponse('r-2', 'test-model', [{ text: 'ok' }], { + promptTokenCount: 10, + candidatesTokenCount: 5, + totalTokenCount: 15, + }), + ), + vi.fn(), + ); + const generator = new LoggingContentGenerator(wrapped, createConfig(), { + model: 'test-model', + authType: AuthType.USE_OPENAI, + enableOpenAILogging: false, + }); + const request = { + model: 'test-model', + contents: 'Hello', + } as unknown as GenerateContentParameters; + + // No retryContext.run() — direct invocation. + await generator.generateContent(request, 'prompt-direct'); + + const record = getGenerateContentSpanRecord(); + const meta = record.endMetadata as { + attempt?: number; + requestSetupMs?: number; + retryTotalDelayMs?: number; + }; + expect(meta.attempt).toBe(1); + expect(meta.requestSetupMs).toBeUndefined(); + expect(meta.retryTotalDelayMs).toBeUndefined(); + }); + + it('stream: snapshots retry context in synchronous prelude and forwards through stream wrapper finally block', async () => { + const { retryContext } = await import('../../utils/retryContext.js'); + + const streamFn = vi.fn().mockResolvedValue( + (async function* () { + yield createResponse('r-s1', 'test-model', [{ text: 'a' }]); + yield createResponse('r-s2', 'test-model', [{ text: 'b' }], { + promptTokenCount: 50, + candidatesTokenCount: 20, + totalTokenCount: 70, + }); + })(), + ); + const wrapped = createWrappedGenerator(vi.fn(), streamFn); + const generator = new LoggingContentGenerator(wrapped, createConfig(), { + model: 'test-model', + authType: AuthType.USE_OPENAI, + enableOpenAILogging: false, + }); + const request = { + model: 'test-model', + contents: 'Hello', + } as unknown as GenerateContentParameters; + + // Critical: the stream wrapper is iterated AFTER retryContext.run resolves + // its synchronous body. The closure-captured snapshot must carry values + // through to the finally block's endLLMRequestSpan call. + await retryContext.run( + { attempt: 2, requestSetupMs: 500, retryTotalDelayMs: 400 }, + async () => { + const stream = await generator.generateContentStream( + request, + 'prompt-retry-stream', + ); + for await (const _ of stream) { + // consume + } + }, + ); + + const record = getStreamSpanRecord(); + expect(record.endMetadata).toMatchObject({ + success: true, + attempt: 2, + requestSetupMs: 500, + retryTotalDelayMs: 400, + inputTokens: 50, + outputTokens: 20, + }); + }); + + it('stream: defaults attempt=1 when iterated outside any retry frame', async () => { + const streamFn = vi.fn().mockResolvedValue( + (async function* () { + yield createResponse('r-s3', 'test-model', [{ text: 'a' }]); + })(), + ); + const wrapped = createWrappedGenerator(vi.fn(), streamFn); + const generator = new LoggingContentGenerator(wrapped, createConfig(), { + model: 'test-model', + authType: AuthType.USE_OPENAI, + enableOpenAILogging: false, + }); + const request = { + model: 'test-model', + contents: 'Hello', + } as unknown as GenerateContentParameters; + + const stream = await generator.generateContentStream( + request, + 'prompt-stream-direct', + ); + for await (const _ of stream) { + // consume + } + + const record = getStreamSpanRecord(); + const meta = record.endMetadata as { + attempt?: number; + requestSetupMs?: number; + retryTotalDelayMs?: number; + }; + expect(meta.attempt).toBe(1); + expect(meta.requestSetupMs).toBeUndefined(); + expect(meta.retryTotalDelayMs).toBeUndefined(); + }); + + it('stream idle-timeout path: retrySnapshot propagates to the setTimeout-fired endLLMRequestSpan (R2 #8)', async () => { + // Review comment R2 #8: the idle-timeout `setTimeout` fires in a separate + // macrotask. Verify the closure-captured retrySnapshot reaches its + // endLLMRequestSpan call with correct retry context values. + // + // Must use fake timers from the START so the 5-min setTimeout created + // inside loggingStreamWrapper uses the fake clock and can be advanced. + vi.useFakeTimers(); + + const { retryContext } = await import('../../utils/retryContext.js'); + + // Stream that resolves its first .next() only after we've advanced + // timers past the idle timeout. We use a deferred promise to hold + // iteration without actually hanging the test runner. + let releaseStream: () => void; + const streamBlocker = new Promise((r) => { + releaseStream = r; + }); + const neverYieldStream = (async function* () { + await streamBlocker; // holds until we release after timer advance + yield createResponse('never', 'test-model', [{ text: 'x' }]); + })(); + + const streamFn = vi.fn().mockResolvedValue(neverYieldStream); + const wrapped = createWrappedGenerator(vi.fn(), streamFn); + const generator = new LoggingContentGenerator(wrapped, createConfig(), { + model: 'test-model', + authType: AuthType.USE_OPENAI, + enableOpenAILogging: false, + }); + const request = { + model: 'test-model', + contents: 'Hello', + } as unknown as GenerateContentParameters; + + // Start the stream inside a retry context. The generator creation + // (generateContentStream) runs synchronously enough to capture the + // retrySnapshot. The consumer's first .next() call starts the for-await + // which immediately awaits the streamBlocker — at that point the idle + // timeout setTimeout(5min) is already scheduled. + await retryContext.run( + { attempt: 4, requestSetupMs: 3000, retryTotalDelayMs: 2500 }, + async () => { + const stream = await generator.generateContentStream( + request, + 'prompt-idle-timeout', + ); + const iter = stream[Symbol.asyncIterator](); + // Start iteration — this enters for-await, resets the idle timer, + // then blocks on streamBlocker. + void iter.next(); + }, + ); + + // Advance past the 5-minute idle timeout (STREAM_IDLE_TIMEOUT_MS) + await vi.advanceTimersByTimeAsync(6 * 60_000); + + // Release the stream so the generator can clean up + releaseStream!(); + await vi.advanceTimersByTimeAsync(100); + + vi.useRealTimers(); + + // Find the span that was ended by the idle timeout + const records = loggingSpanRecords.filter( + (r) => r.name === 'qwen-code.llm_request' && r.endMetadata !== undefined, + ); + const timeoutRecord = records.find( + (r) => r.endMetadata?.error === 'Stream span timed out (idle)', + ); + expect(timeoutRecord).toBeDefined(); + const meta = timeoutRecord!.endMetadata as { + attempt?: number; + requestSetupMs?: number; + retryTotalDelayMs?: number; + error?: string; + }; + expect(meta.attempt).toBe(4); + expect(meta.requestSetupMs).toBe(3000); + expect(meta.retryTotalDelayMs).toBe(2500); + expect(meta.error).toBe('Stream span timed out (idle)'); + }); +}); diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts index 9a20db8f252..51257ac9ceb 100644 --- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts +++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts @@ -62,6 +62,34 @@ import { API_CALL_FAILED_SPAN_STATUS_MESSAGE, } from '../../telemetry/tracer.js'; import { hasUserVisibleContent } from './streamContentDetection.js'; +import { + retryContext, + type RetryAttemptContext, +} from '../../utils/retryContext.js'; + +/** + * Phase 4b — read the active retry context once, default attempt to 1 when + * absent (warmup/side-queries/direct calls). Returns the fields in the exact + * shape consumed by `endLLMRequestSpan` so callers can spread the result. + * + * Called in the SYNCHRONOUS PRELUDE of `generateContent` / `generateContentStream` + * — before the first await — because the streaming path returns an + * AsyncGenerator that's iterated AFTER `retryWithBackoff` has resolved and + * the ALS frame has exited. The closure carries this snapshot to all later + * endLLMRequestSpan callsites (success / error / idle-timeout / abort). + */ +function snapshotRetryMetadata(): { + attempt: number; + requestSetupMs?: number; + retryTotalDelayMs?: number; +} { + const ctx: RetryAttemptContext | undefined = retryContext.getStore(); + return { + attempt: ctx?.attempt ?? 1, + requestSetupMs: ctx?.requestSetupMs, + retryTotalDelayMs: ctx?.retryTotalDelayMs, + }; +} const debugLogger = createDebugLogger('LOGGING_CONTENT_GENERATOR'); @@ -213,6 +241,10 @@ export class LoggingContentGenerator implements ContentGenerator { req: GenerateContentParameters, userPromptId: string, ): Promise { + // Phase 4b — snapshot retry context in the synchronous prelude BEFORE any + // await. ALS frame from `retryWithBackoff` is guaranteed to be active here. + const retrySnapshot = snapshotRetryMetadata(); + const llmSpan = startLLMRequestSpan(req.model, userPromptId); try { llmSpan.setAttribute('llm_request.stream', false); @@ -288,6 +320,7 @@ export class LoggingContentGenerator implements ContentGenerator { outputTokens: response.usageMetadata?.candidatesTokenCount, cachedInputTokens: response.usageMetadata?.cachedContentTokenCount, durationMs: Date.now() - startTime, + ...retrySnapshot, }); return response; } catch (error) { @@ -304,6 +337,7 @@ export class LoggingContentGenerator implements ContentGenerator { error: aborted ? API_CALL_ABORTED_SPAN_STATUS_MESSAGE : API_CALL_FAILED_SPAN_STATUS_MESSAGE, + ...retrySnapshot, }); await context.with(spanContext, async () => { this.safelyLogApiError('', durationMs, error, req.model, userPromptId); @@ -326,6 +360,15 @@ export class LoggingContentGenerator implements ContentGenerator { req: GenerateContentParameters, userPromptId: string, ): Promise> { + // Phase 4b — snapshot retry context in the synchronous prelude. This is + // the only point where the ALS frame from `retryWithBackoff` is guaranteed + // to be active for the streaming path: once this function returns the + // AsyncGenerator, the caller iterates AFTER `retryWithBackoff` has + // resolved and the frame has exited. Threaded as a parameter to + // loggingStreamWrapper so its closure carries the snapshot to all later + // endLLMRequestSpan callsites (success / error / idle-timeout / abort). + const retrySnapshot = snapshotRetryMetadata(); + const llmSpan = startLLMRequestSpan(req.model, userPromptId); try { llmSpan.setAttribute('llm_request.stream', true); @@ -383,6 +426,7 @@ export class LoggingContentGenerator implements ContentGenerator { error: aborted ? API_CALL_ABORTED_SPAN_STATUS_MESSAGE : API_CALL_FAILED_SPAN_STATUS_MESSAGE, + ...retrySnapshot, }); try { await this.safelyLogOpenAIInteraction( @@ -416,6 +460,7 @@ export class LoggingContentGenerator implements ContentGenerator { llmSpan, spanContext, req.config?.abortSignal, + retrySnapshot, ), ); } @@ -451,6 +496,12 @@ export class LoggingContentGenerator implements ContentGenerator { span?: Span, spanContext?: Context, abortSignal?: AbortSignal, + // Phase 4b — snapshot of retry context captured BEFORE the stream wrapper + // returned, when the ALS frame from `retryWithBackoff` was still active. + // Closure-carried to every endLLMRequestSpan callsite below so the + // idle-timeout `setTimeout` callback sees the same values as the + // entry-time read. + retrySnapshot?: ReturnType, ): AsyncGenerator { const isInternal = isInternalPromptId(userPromptId); // Skip collecting full responses for internal prompts to avoid memory @@ -504,6 +555,7 @@ export class LoggingContentGenerator implements ContentGenerator { success: false, durationMs: Date.now() - startTime, error: 'Stream span timed out (idle)', + ...retrySnapshot, }); spanEndedByTimeout = true; }, STREAM_IDLE_TIMEOUT_MS); @@ -626,6 +678,7 @@ export class LoggingContentGenerator implements ContentGenerator { ? API_CALL_ABORTED_SPAN_STATUS_MESSAGE : API_CALL_FAILED_SPAN_STATUS_MESSAGE : undefined, + ...retrySnapshot, }); } } diff --git a/packages/core/src/telemetry/constants.ts b/packages/core/src/telemetry/constants.ts index 8ef41eaef83..f69ab170285 100644 --- a/packages/core/src/telemetry/constants.ts +++ b/packages/core/src/telemetry/constants.ts @@ -29,6 +29,11 @@ export const EVENT_INVALID_CHUNK = 'qwen-code.chat.invalid_chunk'; export const EVENT_CONTENT_RETRY = 'qwen-code.chat.content_retry'; export const EVENT_CONTENT_RETRY_FAILURE = 'qwen-code.chat.content_retry_failure'; +// Phase 4b — HTTP-status retry telemetry emitted by `retryWithBackoff` for +// 429 / 5xx errors at LLM call sites. Distinct from EVENT_CONTENT_RETRY, +// which is fired by geminiChat for InvalidStreamError retries on a separate +// retry budget. See docs/design/telemetry-llm-request-timing-design.md. +export const EVENT_API_RETRY = 'qwen-code.api_retry'; export const EVENT_CONVERSATION_FINISHED = 'qwen-code.conversation_finished'; export const EVENT_MALFORMED_JSON_RESPONSE = 'qwen-code.malformed_json_response'; diff --git a/packages/core/src/telemetry/index.ts b/packages/core/src/telemetry/index.ts index cd8aad316b6..fce64dbfc3b 100644 --- a/packages/core/src/telemetry/index.ts +++ b/packages/core/src/telemetry/index.ts @@ -107,6 +107,7 @@ export { recordInvalidChunk, recordContentRetry, recordContentRetryFailure, + recordApiRetry, // Performance monitoring functions recordStartupPerformance, recordMemoryUsage, diff --git a/packages/core/src/telemetry/loggers.test.ts b/packages/core/src/telemetry/loggers.test.ts index b93d7e930af..b8632a4326d 100644 --- a/packages/core/src/telemetry/loggers.test.ts +++ b/packages/core/src/telemetry/loggers.test.ts @@ -58,6 +58,7 @@ import { logExtensionUninstall, logHookCall, logApiError, + logApiRetry, } from './loggers.js'; import * as metrics from './metrics.js'; import { QwenLogger } from './qwen-logger/qwen-logger.js'; @@ -82,6 +83,7 @@ import { ExtensionUninstallEvent, HookCallEvent, ApiErrorEvent, + ApiRetryEvent, } from './types.js'; import { FileOperation } from './metrics.js'; import type { @@ -1694,4 +1696,86 @@ describe('loggers', () => { expect(passedEvent).toBe(event); }); }); + + // Phase 4b — logApiRetry: HTTP-status retry telemetry from retryWithBackoff. + describe('logApiRetry (Phase 4b)', () => { + const mockQwenLogger = { + logApiRetryEvent: vi.fn(), + }; + + beforeEach(() => { + vi.spyOn(QwenLogger, 'getInstance').mockReturnValue( + mockQwenLogger as unknown as QwenLogger, + ); + mockQwenLogger.logApiRetryEvent.mockClear(); + vi.spyOn(metrics, 'recordApiRetry'); + }); + + function buildEvent( + overrides: Partial<{ + model: string; + promptId: string; + attemptNumber: number; + status: number; + delay: number; + errorMsg: string; + subagentName: string; + }> = {}, + ): ApiRetryEvent { + const err = new Error(overrides.errorMsg ?? 'rate limited'); + return new ApiRetryEvent({ + model: overrides.model ?? 'qwen3', + promptId: overrides.promptId ?? 'p-1', + attemptNumber: overrides.attemptNumber ?? 2, + error: err, + statusCode: overrides.status ?? 429, + retryDelayMs: overrides.delay ?? 1500, + subagentName: overrides.subagentName, + }); + } + + it('fans out to all 3 sinks: QwenLogger, OTel log, and metric counter', () => { + const mockConfig = makeFakeConfig({ sessionId: 'test-session-id' }); + const event = buildEvent(); + logApiRetry(mockConfig, event); + + // 1. QwenLogger RUM + expect(mockQwenLogger.logApiRetryEvent).toHaveBeenCalledWith(event); + // 2. OTel log signal — picked up by LogToSpanProcessor to bridge as span + expect(mockLogger.emit).toHaveBeenCalledTimes(1); + const logRecord = mockLogger.emit.mock.calls[0][0]; + expect(logRecord.body).toContain('API retry attempt 2'); + expect(logRecord.body).toContain('qwen3'); + expect(logRecord.body).toContain('status 429'); + expect(logRecord.attributes['event.name']).toBe('qwen-code.api_retry'); + expect(logRecord.attributes['attempt_number']).toBe(2); + expect(logRecord.attributes['retry_delay_ms']).toBe(1500); + expect(logRecord.attributes['status_code']).toBe(429); + expect(logRecord.attributes['model']).toBe('qwen3'); + // 3. Metric counter — tagged with {model} + expect(metrics.recordApiRetry).toHaveBeenCalledWith(mockConfig, { + model: 'qwen3', + }); + }); + + it('propagates subagent_name when present', () => { + const mockConfig = makeFakeConfig({ sessionId: 'test-session-id' }); + const event = buildEvent({ subagentName: 'explore-agent' }); + logApiRetry(mockConfig, event); + + const logRecord = mockLogger.emit.mock.calls[0][0]; + expect(logRecord.attributes['subagent_name']).toBe('explore-agent'); + }); + + it('skips logger.emit and metric counter when SDK is not initialized (QwenLogger still called)', () => { + vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false); + const mockConfig = makeFakeConfig({ sessionId: 'test-session-id' }); + const event = buildEvent(); + logApiRetry(mockConfig, event); + + expect(mockQwenLogger.logApiRetryEvent).toHaveBeenCalledWith(event); + expect(mockLogger.emit).not.toHaveBeenCalled(); + expect(metrics.recordApiRetry).not.toHaveBeenCalled(); + }); + }); }); diff --git a/packages/core/src/telemetry/loggers.ts b/packages/core/src/telemetry/loggers.ts index b45798fc7cd..716aa0e0cff 100644 --- a/packages/core/src/telemetry/loggers.ts +++ b/packages/core/src/telemetry/loggers.ts @@ -30,6 +30,7 @@ import { EVENT_CHAT_COMPRESSION, EVENT_CONTENT_RETRY, EVENT_CONTENT_RETRY_FAILURE, + EVENT_API_RETRY, EVENT_FILE_OPERATION, EVENT_RIPGREP_FALLBACK, EVENT_EXTENSION_INSTALL, @@ -57,6 +58,7 @@ import { recordChatCompressionMetrics, recordContentRetry, recordContentRetryFailure, + recordApiRetry, recordFileOperationMetric, recordInvalidChunk, recordModelSlashCommand, @@ -93,6 +95,7 @@ import type { ChatCompressionEvent, ContentRetryEvent, ContentRetryFailureEvent, + ApiRetryEvent, RipgrepFallbackEvent, ToolOutputTruncatedEvent, ExtensionDisableEvent, @@ -756,6 +759,38 @@ export function logContentRetryFailure( recordContentRetryFailure(config); } +/** + * Phase 4b — Emits an HTTP-status retry event fired from `retryWithBackoff` + * at an LLM call site (via the `onRetry` callback opt-in). Distinct from + * `logContentRetry`, which is fired by `geminiChat`'s content-recovery loop. + * + * Three-sink fan-out, matching the `logContentRetry` shape exactly: + * 1. QwenLogger RUM ingestion (Aliyun internal stats) + * 2. OTel log signal via `logger.emit()` — picked up by LogToSpanProcessor + * and bridged to a span sibling under the caller's active span (typically + * interaction or tool, NOT the failed LLM span — that span has already + * ended by the time onRetry fires). + * 3. `recordApiRetry` Counter increment for per-model retry-rate dashboards. + */ +export function logApiRetry(config: Config, event: ApiRetryEvent): void { + QwenLogger.getInstance(config)?.logApiRetryEvent(event); + if (!isTelemetrySdkInitialized()) return; + + const attributes: LogAttributes = { + ...getCommonAttributes(config), + ...event, + 'event.name': EVENT_API_RETRY, + }; + + const logger = logs.getLogger(SERVICE_NAME); + const logRecord: LogRecord = { + body: `API retry attempt ${event.attempt_number} for ${event.model} (status ${event.status_code ?? 'unknown'}).`, + attributes, + }; + logger.emit(logRecord); + recordApiRetry(config, { model: event.model }); +} + export function logSubagentExecution( config: Config, event: SubagentExecutionEvent, diff --git a/packages/core/src/telemetry/metrics.ts b/packages/core/src/telemetry/metrics.ts index 7d9de142ee5..e6c47efd50c 100644 --- a/packages/core/src/telemetry/metrics.ts +++ b/packages/core/src/telemetry/metrics.ts @@ -20,6 +20,9 @@ const FILE_OPERATION_COUNT = `${SERVICE_NAME}.file.operation.count`; const INVALID_CHUNK_COUNT = `${SERVICE_NAME}.chat.invalid_chunk.count`; const CONTENT_RETRY_COUNT = `${SERVICE_NAME}.chat.content_retry.count`; const CONTENT_RETRY_FAILURE_COUNT = `${SERVICE_NAME}.chat.content_retry_failure.count`; +// Phase 4b — Counts HTTP-status retries emitted by `retryWithBackoff` at LLM +// call sites. Tagged by `model` so operators can graph per-model retry rate. +const API_RETRY_COUNT = `${SERVICE_NAME}.api.retry.count`; const MODEL_SLASH_COMMAND_CALL_COUNT = `${SERVICE_NAME}.slash_command.model.call_count`; export const SUBAGENT_EXECUTION_COUNT = `${SERVICE_NAME}.subagent.execution.count`; @@ -135,6 +138,15 @@ const COUNTER_DEFINITIONS = { assign: (c: Counter) => (contentRetryFailureCounter = c), attributes: {} as Record, }, + [API_RETRY_COUNT]: { + description: + 'Counts HTTP-status retries (429/5xx) at LLM call sites, emitted by retryWithBackoff onRetry callback.', + valueType: ValueType.INT, + assign: (c: Counter) => (apiRetryCounter = c), + attributes: {} as { + model: string; + }, + }, [MODEL_SLASH_COMMAND_CALL_COUNT]: { description: 'Counts model slash command calls.', valueType: ValueType.INT, @@ -356,6 +368,7 @@ let chatCompressionCounter: Counter | undefined; let invalidChunkCounter: Counter | undefined; let contentRetryCounter: Counter | undefined; let contentRetryFailureCounter: Counter | undefined; +let apiRetryCounter: Counter | undefined; let subagentExecutionCounter: Counter | undefined; let modelSlashCommandCallCounter: Counter | undefined; @@ -626,6 +639,23 @@ export function recordContentRetryFailure(config: Config): void { ); } +/** + * Phase 4b — Records a metric for an HTTP-status retry at an LLM call site. + * Tagged by `model` so operators can graph per-model retry rate. Called from + * `logApiRetry` in loggers.ts which is wired to `retryWithBackoff`'s `onRetry` + * callback at the 4 LLM call sites. + */ +export function recordApiRetry( + config: Config, + attributes: MetricDefinitions[typeof API_RETRY_COUNT]['attributes'], +): void { + if (!apiRetryCounter || !isMetricsInitialized) return; + apiRetryCounter.add(1, { + ...baseMetricDefinition.getCommonAttributes(config), + ...attributes, + }); +} + export function recordModelSlashCommand( config: Config, event: ModelSlashCommandEvent, diff --git a/packages/core/src/telemetry/qwen-logger/qwen-logger.ts b/packages/core/src/telemetry/qwen-logger/qwen-logger.ts index b1a487ce675..0c932de84b8 100644 --- a/packages/core/src/telemetry/qwen-logger/qwen-logger.ts +++ b/packages/core/src/telemetry/qwen-logger/qwen-logger.ts @@ -31,6 +31,7 @@ import type { ChatCompressionEvent, InvalidChunkEvent, ContentRetryEvent, + ApiRetryEvent, ContentRetryFailureEvent, ConversationFinishedEvent, SubagentExecutionEvent, @@ -958,6 +959,26 @@ export class QwenLogger { this.flushIfNeeded(); } + // Phase 4b — HTTP-status retry from retryWithBackoff (429/5xx). Distinct from + // logContentRetryEvent which is fired by geminiChat's content-recovery loop. + logApiRetryEvent(event: ApiRetryEvent): void { + const rumEvent = this.createActionEvent('misc', 'api_retry', { + properties: { + model: event.model, + prompt_id: event.prompt_id ?? '', + attempt_number: event.attempt_number, + error_type: event.error_type ?? 'unknown', + status_code: + event.status_code !== undefined ? String(event.status_code) : '', + retry_delay_ms: event.retry_delay_ms, + subagent_name: event.subagent_name ?? '', + }, + }); + + this.enqueueLogEvent(rumEvent); + this.flushIfNeeded(); + } + // arena events logArenaSessionStartedEvent(event: ArenaSessionStartedEvent): void { const rumEvent = this.createActionEvent('arena', 'arena_session_started', { diff --git a/packages/core/src/telemetry/session-tracing.test.ts b/packages/core/src/telemetry/session-tracing.test.ts index 7a40c304532..9cd046026bc 100644 --- a/packages/core/src/telemetry/session-tracing.test.ts +++ b/packages/core/src/telemetry/session-tracing.test.ts @@ -521,33 +521,42 @@ describe('session-tracing', () => { durationMs: 1000, }); - // sampling_ms = duration - ttft - (requestSetup ?? 0) = 1000 - 200 - 0 + // sampling_ms = duration - ttft = 1000 - 200 (setup is NOT subtracted — + // duration_ms only covers ttft + sampling, never the setup phase that + // precedes the span. See Phase 4b commit fixing the formula bug.) expect(mockSpans[0]!.attributes['sampling_ms']).toBe(800); }); - it('endLLMRequestSpan derives sampling_ms accounting for requestSetupMs (Phase 4b populates this)', () => { + it('endLLMRequestSpan does NOT subtract requestSetupMs from sampling_ms (Phase 4b bug fix)', () => { + // Phase 4a's formula `duration - ttft - setup` double-counted setup + // because duration_ms ALREADY excludes setup (span starts after setup). + // Phase 4b populates requestSetupMs with cumulative retry overhead — + // if the formula still subtracted setup, sampling_ms would clamp to 0 + // for every retried request, wiping output-throughput data. const span = startLLMRequestSpan('m', 'p'); endLLMRequestSpan(span, { success: true, ttftMs: 200, - requestSetupMs: 300, + requestSetupMs: 300, // would yield 500 under old formula; we want 800 durationMs: 1000, }); - // sampling_ms = 1000 - 200 - 300 - expect(mockSpans[0]!.attributes['sampling_ms']).toBe(500); + expect(mockSpans[0]!.attributes['sampling_ms']).toBe(800); + // request_setup_ms is still emitted as its own attribute — operators can + // see the retry overhead AND the sampling time independently. + expect(mockSpans[0]!.attributes['request_setup_ms']).toBe(300); }); - it('endLLMRequestSpan clamps sampling_ms to 0 when ttft + setup exceed duration (clock skew)', () => { + it('endLLMRequestSpan clamps sampling_ms to 0 when ttft exceeds duration (clock skew)', () => { const span = startLLMRequestSpan('m', 'p'); endLLMRequestSpan(span, { success: true, - ttftMs: 800, - requestSetupMs: 500, + ttftMs: 1500, durationMs: 1000, }); - // Math.max(0, 1000 - 800 - 500) = 0 + // Math.max(0, 1000 - 1500) = 0 — only triggers when ttft > duration, + // which in practice means clock drift or a measurement bug. expect(mockSpans[0]!.attributes['sampling_ms']).toBe(0); }); diff --git a/packages/core/src/telemetry/session-tracing.ts b/packages/core/src/telemetry/session-tracing.ts index ee53a56091d..c62f65fe32e 100644 --- a/packages/core/src/telemetry/session-tracing.ts +++ b/packages/core/src/telemetry/session-tracing.ts @@ -70,20 +70,27 @@ export interface LLMRequestMetadata { */ ttftMs?: number; /** - * Time from generateContent/generateContentStream entry to the start of the - * successful attempt (ms). Includes all failed retries + backoff sleeps. - * Populated by the retry layer in Phase 4b; undefined in Phase 4a. + * Time from `retryWithBackoff` entry to THIS attempt's start (ms). On a + * successful-attempt span this doubles as the total retry overhead before + * success. On a failed-attempt span this is the cumulative time elapsed in + * the retry budget at the moment this attempt fired (= attempts 1..N-1's + * durations + their backoff sleeps). + * + * Undefined when no retry context exists (direct calls bypassing + * retryWithBackoff: warmup, side-queries, etc.). Populated by the retry + * layer in Phase 4b via AsyncLocalStorage (`retryContext`). */ requestSetupMs?: number; /** - * Final attempt number (1-based). 1 = no retries. Populated by the retry - * layer in Phase 4b; undefined in Phase 4a. + * 1-based monotonic attempt counter, populated by LoggingContentGenerator + * from `retryContext.getStore()`. Defaults to 1 when no retry context is + * present so dashboards filtering `WHERE attempt=1` include direct/warmup + * calls. Populated by Phase 4b retry layer for attempt >= 2. */ attempt?: number; /** - * Sum of all backoff delays before the successful attempt (ms). 0 if no - * retries occurred. Populated by the retry layer in Phase 4b; undefined - * in Phase 4a. + * Sum of all backoff delays BEFORE this attempt started (ms). 0 for attempt 1. + * Undefined when no retry context exists. Populated by Phase 4b retry layer. */ retryTotalDelayMs?: number; } @@ -438,15 +445,21 @@ export function endLLMRequestSpan( endAttributes['retry_total_delay_ms'] = metadata.retryTotalDelayMs; } // Derived: sampling_ms = time from first user-visible chunk to end - // (== output generation time, excluding setup + first-token delay). - // Computable only when ttftMs is set. requestSetupMs defaults to 0 - // when undefined (no retries) — this gives the correct sampling - // duration in both Phase 4a (no retry data) and Phase 4b (with). + // (== output generation time for THIS attempt). + // + // NOTE on Phase 4a bug fix: previous formula `duration - ttft - setup` + // double-counted the setup time. `duration_ms` is computed as + // `Date.now() - spanCtx.startTime`, and startTime is captured when + // `startLLMRequestSpan` runs — which is AFTER `requestSetupMs` worth of + // overhead has already passed. So the span's `duration_ms` only covers + // `ttft + sampling`, never the preceding setup. Subtracting `setup` again + // is wrong. In Phase 4a, `requestSetupMs` was always undefined so the + // bug was masked (0 subtraction). Phase 4b populates `requestSetupMs` + // with cumulative retry overhead, which would have clamped sampling_ms + // to 0 for every retried request — wiping out output-throughput data + // exactly when operators need it most. Fixed here. if (metadata.ttftMs !== undefined) { - const samplingMs = Math.max( - 0, - duration - metadata.ttftMs - (metadata.requestSetupMs ?? 0), - ); + const samplingMs = Math.max(0, duration - metadata.ttftMs); endAttributes['sampling_ms'] = samplingMs; // Derived: output tokens per second during sampling. Undefined when // sampling_ms is 0 (avoid divide-by-zero) or when outputTokens missing. diff --git a/packages/core/src/telemetry/types.ts b/packages/core/src/telemetry/types.ts index 866d0637dbe..fcb8ca8d017 100644 --- a/packages/core/src/telemetry/types.ts +++ b/packages/core/src/telemetry/types.ts @@ -646,6 +646,69 @@ export class ContentRetryEvent implements BaseTelemetryEvent { } } +/** + * Phase 4b — HTTP-status retry telemetry. Emitted by `retryWithBackoff` (via + * the `onRetry` callback opt-in) for HTTP 429 / 5xx retries at LLM call sites. + * + * Distinct from {@link ContentRetryEvent}, which is emitted by `geminiChat`'s + * for-loop for `InvalidStreamError` retries that go through a SEPARATE retry + * budget (`INVALID_CONTENT_RETRY_OPTIONS`, NOT `retryWithBackoff`). A single + * user prompt may fire BOTH event types; sum across event types to count total + * retries per prompt_id. + */ +export class ApiRetryEvent implements BaseTelemetryEvent { + 'event.name': 'api_retry'; + 'event.timestamp': string; // ISO 8601 + model: string; + prompt_id?: string; + attempt_number: number; // 1-based monotonic counter (matches ALS retryContext.attempt) + error_type?: string; + error_message: string; + status_code?: number | string; + retry_delay_ms: number; + /** + * Reports the backoff delay following this failed attempt (NOT the attempt's + * own duration — that lives on the corresponding `qwen-code.llm_request` + * span's `duration_ms` attribute). Set equal to `retry_delay_ms` so the + * LogToSpanProcessor bridge span visualises the sleep window between the + * failed and next attempt in the trace timeline. + */ + duration_ms: number; + /** + * Name of the subagent that issued the retrying request, or undefined when + * the request originates from the main conversation. Read from + * `subagentNameContext.getStore()` at the caller site (subagentNameContext + * is still active inside `retry.ts`'s catch block where `onRetry` fires). + */ + subagent_name?: string; + + constructor(opts: { + model: string; + promptId?: string; + attemptNumber: number; + error: unknown; + statusCode?: number | string; + retryDelayMs: number; + subagentName?: string; + }) { + this['event.name'] = 'api_retry'; + this['event.timestamp'] = new Date().toISOString(); + this.model = opts.model; + this.prompt_id = opts.promptId; + this.attempt_number = opts.attemptNumber; + this.error_message = + opts.error instanceof Error + ? opts.error.message + : String(opts.error ?? 'unknown error'); + this.error_type = + opts.error instanceof Error ? opts.error.constructor.name : undefined; + this.status_code = opts.statusCode; + this.retry_delay_ms = opts.retryDelayMs; + this.duration_ms = opts.retryDelayMs; + this.subagent_name = opts.subagentName; + } +} + export class ContentRetryFailureEvent implements BaseTelemetryEvent { 'event.name': 'content_retry_failure'; 'event.timestamp': string; @@ -963,6 +1026,7 @@ export type TelemetryEvent = | InvalidChunkEvent | ContentRetryEvent | ContentRetryFailureEvent + | ApiRetryEvent | SubagentExecutionEvent | ExtensionEnableEvent | ExtensionInstallEvent diff --git a/packages/core/src/utils/retry.test.ts b/packages/core/src/utils/retry.test.ts index 0fd2478602a..cfcb13cc804 100644 --- a/packages/core/src/utils/retry.test.ts +++ b/packages/core/src/utils/retry.test.ts @@ -14,12 +14,13 @@ import { afterEach, afterAll, } from 'vitest'; -import type { HttpError } from './retry.js'; +import type { HttpError, RetryAttemptInfo } from './retry.js'; import { retryWithBackoff, isTransientCapacityError, isUnattendedMode, } from './retry.js'; +import { retryContext } from './retryContext.js'; import { getErrorStatus } from './errors.js'; import { setSimulate429 } from './testUtils.js'; import { AuthType } from '../core/contentGenerator.js'; @@ -1024,3 +1025,404 @@ describe('getErrorStatus', () => { expect(getErrorStatus(new Error('HTTP_STATUS/4291'))).toBeUndefined(); }); }); + +// ========================================================================= +// Phase 4b — retry telemetry (ALS context + onRetry callback + monotonic counter) +// ========================================================================= +describe('retryWithBackoff — Phase 4b retry context (ALS)', () => { + beforeEach(() => { + // Use fake timers consistently with the rest of this file — vitest's + // useRealTimers between describes is unreliable when other describes + // have stubbed timer globals. We advance via vi.runAllTimersAsync(). + vi.useFakeTimers(); + setSimulate429(false); + console.warn = vi.fn(); + }); + + afterEach(() => { + vi.useRealTimers(); + vi.restoreAllMocks(); + }); + + it('sets retryContext.attempt monotonically across attempts', async () => { + const seenAttempts: number[] = []; + let attempts = 0; + const fn = vi.fn(async () => { + attempts++; + seenAttempts.push(retryContext.getStore()?.attempt ?? -1); + if (attempts <= 2) { + const err: HttpError = new Error('transient'); + err.status = 500; + throw err; + } + return 'ok'; + }); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 5, + }); + + await vi.runAllTimersAsync(); + await expect(promise).resolves.toBe('ok'); + expect(seenAttempts).toEqual([1, 2, 3]); + }); + + it('exposes retryContext.requestSetupMs / retryTotalDelayMs (== 0 for attempt 1, > 0 for retries)', async () => { + const snapshots: Array<{ setupMs: number; totalDelayMs: number }> = []; + let attempts = 0; + const fn = vi.fn(async () => { + attempts++; + const ctx = retryContext.getStore(); + snapshots.push({ + setupMs: ctx?.requestSetupMs ?? -1, + totalDelayMs: ctx?.retryTotalDelayMs ?? -1, + }); + if (attempts <= 2) { + const err: HttpError = new Error('transient'); + err.status = 500; + throw err; + } + return 'ok'; + }); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 10, + maxDelayMs: 50, + }); + + await vi.runAllTimersAsync(); + await promise; + + // Attempt 1: nothing happened before, so both are 0. + expect(snapshots[0]!.setupMs).toBe(0); + expect(snapshots[0]!.totalDelayMs).toBe(0); + // Attempts 2+: both fields populate with positive values once retries + // have run. Exact values depend on the jittered backoff; assert monotonic. + expect(snapshots[1]!.setupMs).toBeGreaterThanOrEqual(0); + expect(snapshots[1]!.totalDelayMs).toBeGreaterThan(0); + expect(snapshots[2]!.setupMs).toBeGreaterThanOrEqual(snapshots[1]!.setupMs); + expect(snapshots[2]!.totalDelayMs).toBeGreaterThan( + snapshots[1]!.totalDelayMs, + ); + }); + + it('first-try success: retryContext.attempt === 1, both delays === 0, onRetry never called', async () => { + let observed: { attempt: number; setup: number; delay: number } | null = + null; + const onRetry = vi.fn(); + const fn = vi.fn(async () => { + const ctx = retryContext.getStore(); + observed = { + attempt: ctx?.attempt ?? -1, + setup: ctx?.requestSetupMs ?? -1, + delay: ctx?.retryTotalDelayMs ?? -1, + }; + return 'ok'; + }); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 5, + onRetry, + }); + + await vi.runAllTimersAsync(); + await promise; + + expect(observed).toEqual({ attempt: 1, setup: 0, delay: 0 }); + expect(onRetry).not.toHaveBeenCalled(); + }); + + it('onRetry callback fires once per failed attempt with correct args', async () => { + const onRetry = vi.fn(); + const fn = createFailingFunction(2, 'ok'); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 5, + onRetry, + }); + + await vi.runAllTimersAsync(); + await promise; + + // 2 failures -> 2 onRetry invocations + expect(onRetry).toHaveBeenCalledTimes(2); + const first = onRetry.mock.calls[0]![0] as RetryAttemptInfo; + expect(first.attempt).toBe(1); + expect(first.errorStatus).toBe(500); + expect((first.error as Error).message).toContain('attempt 1'); + expect(first.delayMs).toBeGreaterThanOrEqual(0); + const second = onRetry.mock.calls[1]![0] as RetryAttemptInfo; + expect(second.attempt).toBe(2); + }); + + it('absence of onRetry is silent (no exception)', async () => { + const fn = createFailingFunction(1, 'ok'); + // No onRetry passed. Must not throw or warn. + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 5, + }); + await vi.runAllTimersAsync(); + await expect(promise).resolves.toBe('ok'); + }); + + it('onRetry callback throwing does NOT break the retry loop', async () => { + const onRetry = vi.fn(() => { + throw new Error('telemetry blew up'); + }); + const fn = createFailingFunction(2, 'ok'); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 5, + onRetry, + }); + + await vi.runAllTimersAsync(); + await expect(promise).resolves.toBe('ok'); + expect(onRetry).toHaveBeenCalledTimes(2); + }); + + it('shouldRetryOnError returns false mid-loop: onRetry not called for the giveup', async () => { + // Attempt 1 fails with 500 (retryable), attempt 2 fails with 400 + // (non-retryable). Retry loop gives up on attempt 2 without invoking + // onRetry for it. + const onRetry = vi.fn(); + let n = 0; + const fn = vi.fn(async () => { + n++; + const err: HttpError = new Error(`attempt ${n}`); + err.status = n === 1 ? 500 : 400; + throw err; + }); + + // Attach .catch() BEFORE the timer runs, so Vitest sees the promise has + // a handler when the rejection lands (avoids unhandled-rejection warnings). + const caught = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 5, + shouldRetryOnError: (e) => + (e as HttpError).status === 500 || (e as HttpError).status === 429, + onRetry, + }).catch((e: unknown) => e); + await vi.runAllTimersAsync(); + const error = await caught; + expect((error as Error).message).toBe('attempt 2'); + + // Only the FIRST failed attempt invoked onRetry (it led to a retry). + // The second failed attempt aborted the loop and did not. + expect(onRetry).toHaveBeenCalledTimes(1); + expect(onRetry.mock.calls[0]![0].attempt).toBe(1); + }); + + it('parallel retryWithBackoff calls maintain independent attempt counters', async () => { + // Two concurrent retryWithBackoff invocations must each see their own + // ALS context (AsyncLocalStorage isolates them by async chain). + const callA: number[] = []; + const callB: number[] = []; + + const makeFn = (sink: number[]) => { + let n = 0; + return vi.fn(async () => { + n++; + sink.push(retryContext.getStore()?.attempt ?? -1); + if (n <= 1) { + const err: HttpError = new Error('boom'); + err.status = 500; + throw err; + } + return 'ok'; + }); + }; + + const both = Promise.all([ + retryWithBackoff(makeFn(callA), { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 3, + }), + retryWithBackoff(makeFn(callB), { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 3, + }), + ]); + + await vi.runAllTimersAsync(); + await both; + + expect(callA).toEqual([1, 2]); + expect(callB).toEqual([1, 2]); + }); + + it('nested retryWithBackoff reads innermost frame', async () => { + const observed: Array<{ + layer: 'outer' | 'inner'; + attempt: number; + }> = []; + let innerAttempts = 0; + + const inner = vi.fn(async () => { + innerAttempts++; + observed.push({ + layer: 'inner', + attempt: retryContext.getStore()?.attempt ?? -1, + }); + if (innerAttempts <= 1) { + const err: HttpError = new Error('inner-fail'); + err.status = 500; + throw err; + } + return 'inner-ok'; + }); + + const outer = vi.fn(async () => { + observed.push({ + layer: 'outer', + attempt: retryContext.getStore()?.attempt ?? -1, + }); + return await retryWithBackoff(inner, { + maxAttempts: 5, + initialDelayMs: 1, + maxDelayMs: 3, + }); + }); + + const promise = retryWithBackoff(outer, { + maxAttempts: 1, + initialDelayMs: 1, + maxDelayMs: 3, + }); + + await vi.runAllTimersAsync(); + await promise; + + // Outer call sees its own frame's attempt (1). + // Inner calls see their own frame's attempt (1, then 2 after retry). + // Inner DOES NOT see the outer's frame. + expect(observed).toEqual([ + { layer: 'outer', attempt: 1 }, + { layer: 'inner', attempt: 1 }, + { layer: 'inner', attempt: 2 }, + ]); + }); + + it('persistent mode (status=429): onRetry fires with correct attempt + delayMs from persistent backoff', async () => { + // Review comment R1 #4 + R2 #3: the highest-volume production retry path + // (429 → persistent mode) was untested. Verify onRetry fires with the + // monotonic iterationCount and a reasonable backoff delay. + const onRetry = vi.fn(); + let n = 0; + const fn = vi.fn(async () => { + n++; + if (n <= 2) { + const err: HttpError = new Error(`rate limited #${n}`); + err.status = 429; + throw err; + } + return 'ok'; + }); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 50, + maxDelayMs: 200, + persistentMode: true, + onRetry, + }); + + await vi.runAllTimersAsync(); + await expect(promise).resolves.toBe('ok'); + + expect(onRetry).toHaveBeenCalledTimes(2); + const first = onRetry.mock.calls[0]![0] as RetryAttemptInfo; + expect(first.attempt).toBe(1); + expect(first.errorStatus).toBe(429); + expect(first.delayMs).toBeGreaterThan(0); + const second = onRetry.mock.calls[1]![0] as RetryAttemptInfo; + expect(second.attempt).toBe(2); + expect(second.errorStatus).toBe(429); + // Persistent mode uses exponential backoff — second delay >= first + expect(second.delayMs).toBeGreaterThanOrEqual(first.delayMs); + }); + + it('normal retry with Retry-After header: onRetry receives the header-derived delayMs', async () => { + // Review comment R2 #7: verify that when the error includes a + // `retry-after` header, `onRetry.delayMs` reflects the parsed value + // (not the exponential backoff calculation). + const onRetry = vi.fn(); + let n = 0; + const fn = vi.fn(async () => { + n++; + if (n <= 1) { + const err = new Error('rate limited') as HttpError & { + response?: { headers?: { 'retry-after'?: string } }; + }; + err.status = 429; + err.response = { headers: { 'retry-after': '2' } }; // 2 seconds + throw err; + } + return 'ok'; + }); + + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 100, + maxDelayMs: 500, + onRetry, + }); + + await vi.runAllTimersAsync(); + await expect(promise).resolves.toBe('ok'); + + expect(onRetry).toHaveBeenCalledTimes(1); + const info = onRetry.mock.calls[0]![0] as RetryAttemptInfo; + // Retry-After: 2 → 2000ms + expect(info.delayMs).toBe(2000); + expect(info.errorStatus).toBe(429); + }); + + it('signal.aborted before onRetry: no phantom retry event emitted', async () => { + // Review comment R2 #6: when signal fires between catch and onRetry, + // the guard `if (!signal?.aborted)` should prevent onRetry from firing. + const onRetry = vi.fn(); + const controller = new AbortController(); + let n = 0; + const fn = vi.fn(async () => { + n++; + if (n === 1) { + // Abort the signal during the first failure — before onRetry runs + controller.abort(); + const err: HttpError = new Error('server error'); + err.status = 500; + throw err; + } + return 'ok'; + }); + + // The retry loop should detect the aborted signal and NOT fire onRetry. + const promise = retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 10, + maxDelayMs: 50, + signal: controller.signal, + onRetry, + }).catch((e: unknown) => e); + + await vi.runAllTimersAsync(); + await promise; + + // onRetry should NOT have been called because signal was aborted + expect(onRetry).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/core/src/utils/retry.ts b/packages/core/src/utils/retry.ts index 54b0b6db9e0..12bed9e6826 100644 --- a/packages/core/src/utils/retry.ts +++ b/packages/core/src/utils/retry.ts @@ -9,6 +9,7 @@ import { AuthType } from '../core/contentGenerator.js'; import { isQwenQuotaExceededError } from './quotaErrorDetection.js'; import { createDebugLogger } from './debugLogger.js'; import { getErrorStatus } from './errors.js'; +import { retryContext } from './retryContext.js'; const debugLogger = createDebugLogger('RETRY'); @@ -27,6 +28,22 @@ export interface HeartbeatInfo { error: unknown; } +/** + * Information passed to `RetryOptions.onRetry` after each failed attempt. + * Lets callers (LLM call sites) emit `ApiRetryEvent` telemetry without + * coupling `retry.ts` to telemetry concerns. + */ +export interface RetryAttemptInfo { + /** + * 1-based monotonic iteration counter — same value as ALS context's `attempt`. + */ + attempt: number; + error: unknown; + errorStatus?: number; + /** Computed backoff delay that follows this failed attempt (ms). */ + delayMs: number; +} + export interface RetryOptions { maxAttempts: number; initialDelayMs: number; @@ -41,6 +58,25 @@ export interface RetryOptions { heartbeatIntervalMs?: number; heartbeatFn?: (info: HeartbeatInfo) => void; signal?: AbortSignal; + /** + * Optional. Called once per failed attempt after the backoff delay is + * computed but BEFORE the sleep. Use this to emit retry telemetry events + * (e.g. `ApiRetryEvent` for LLM call sites); leave undefined for non-LLM + * callers so they stay silent in LLM-specific telemetry channels. + * + * Contract: + * - Invoked only after `await fn()` rejects in the catch block of + * `retryWithBackoff` (OUTSIDE the `retryContext.run()` ALS frame). + * This is true for both synchronous and asynchronous throws from `fn`. + * All retry-context data is passed via the `RetryAttemptInfo` parameter + * — do NOT read `retryContext.getStore()` inside an `onRetry` callback. + * - Content-retries via `shouldRetryOnContent` do NOT fire `onRetry`. + * If a future caller wires content retries, extend `retry.ts` to fire + * `onRetry` on that path too. + * - Callback errors are swallowed and logged via `debugLogger.warn`; they + * never affect retry behavior (best-effort telemetry). + */ + onRetry?: (info: RetryAttemptInfo) => void; } const DEFAULT_RETRY_OPTIONS: RetryOptions = { @@ -162,6 +198,7 @@ export async function retryWithBackoff( heartbeatIntervalMs, heartbeatFn, signal, + onRetry, } = { ...DEFAULT_RETRY_OPTIONS, ...cleanOptions, @@ -176,10 +213,24 @@ export async function retryWithBackoff( let persistentAttempt = 0; let currentDelay = initialDelayMs; + // Phase 4b — retry telemetry context. `iterationCount` is the monotonic + // counter that always reflects "this is the Nth time fn was called", + // regardless of normal vs persistent retry mode. Decoupled from the + // `attempt` variable above which is clamped at `maxAttempts - 1` in + // persistent mode to keep the while-loop alive. + const requestEntryTime = Date.now(); + let iterationCount = 0; + let retryTotalDelayMs = 0; + while (attempt < maxAttempts) { attempt++; + iterationCount++; + const requestSetupMs = Date.now() - requestEntryTime; try { - const result = await fn(); + const result = await retryContext.run( + { attempt: iterationCount, retryTotalDelayMs, requestSetupMs }, + () => fn(), + ); if ( shouldRetryOnContent && @@ -188,6 +239,12 @@ export async function retryWithBackoff( const jitter = currentDelay * 0.3 * (Math.random() * 2 - 1); const delayWithJitter = Math.max(0, currentDelay + jitter); await delay(delayWithJitter); + // Note: this inflates retryTotalDelayMs beyond what onRetry/ApiRetryEvent + // reports — content-retry delays are invisible in the api_retry telemetry + // channel (onRetry only fires from the catch-block error path). The LLM + // span's retry_total_delay_ms attribute includes ALL delays (content + + // error), which is the accurate "total time the user waited in backoff." + retryTotalDelayMs += delayWithJitter; currentDelay = Math.min(maxDelayMs, currentDelay * 2); continue; } @@ -256,6 +313,27 @@ export async function retryWithBackoff( error, ); + // Phase 4b — fire onRetry telemetry callback BEFORE sleep, so + // operators see retry events live. Guard with signal?.aborted so we + // don't emit a phantom retry event for an attempt that will never + // actually proceed (signal fires during the previous sleep or between + // catch and this point). Wrap in try/catch: a logging failure must + // NEVER break the retry loop. + if (!signal?.aborted) { + try { + onRetry?.({ + attempt: iterationCount, + error, + errorStatus, + delayMs, + }); + } catch (cbError) { + debugLogger.warn( + `onRetry callback threw (swallowed): ${cbError instanceof Error ? cbError.message : String(cbError)}`, + ); + } + } + // Heartbeat sleep — chunked to keep CI alive await sleepWithHeartbeat(delayMs, { attempt: reportedAttempt, @@ -264,30 +342,53 @@ export async function retryWithBackoff( heartbeatFn, signal, }); + retryTotalDelayMs += delayMs; // Clamp attempt so the while-loop never exits if (attempt >= maxAttempts) { attempt = maxAttempts - 1; } } else { - // Normal retry path (unchanged behavior) + // Normal retry path const retryAfterMs = errorStatus === 429 ? getRetryAfterDelayMs(error) : 0; + let actualDelayMs: number; if (retryAfterMs > 0) { debugLogger.warn( `Attempt ${attempt} failed with status ${errorStatus ?? 'unknown'}. Retrying after explicit delay of ${retryAfterMs}ms...`, error, ); - await delay(retryAfterMs); + actualDelayMs = retryAfterMs; currentDelay = initialDelayMs; } else { logRetryAttempt(attempt, error, errorStatus); const jitter = currentDelay * 0.3 * (Math.random() * 2 - 1); - const delayWithJitter = Math.max(0, currentDelay + jitter); - await delay(delayWithJitter); + actualDelayMs = Math.max(0, currentDelay + jitter); currentDelay = Math.min(maxDelayMs, currentDelay * 2); } + + // Phase 4b — fire onRetry telemetry callback BEFORE sleep. Guard + // with signal?.aborted to avoid phantom events when abort fires + // between catch and here. Wrapped in try/catch so a logging failure + // cannot break the retry loop. + if (!signal?.aborted) { + try { + onRetry?.({ + attempt: iterationCount, + error, + errorStatus, + delayMs: actualDelayMs, + }); + } catch (cbError) { + debugLogger.warn( + `onRetry callback threw (swallowed): ${cbError instanceof Error ? cbError.message : String(cbError)}`, + ); + } + } + + await delay(actualDelayMs); + retryTotalDelayMs += actualDelayMs; } } } diff --git a/packages/core/src/utils/retryContext.ts b/packages/core/src/utils/retryContext.ts new file mode 100644 index 00000000000..fac90c8758a --- /dev/null +++ b/packages/core/src/utils/retryContext.ts @@ -0,0 +1,40 @@ +/** + * @license + * Copyright 2026 Qwen Team + * SPDX-License-Identifier: Apache-2.0 + */ + +import { AsyncLocalStorage } from 'node:async_hooks'; + +/** + * Per-attempt retry context propagated through AsyncLocalStorage from + * `retryWithBackoff` down to `LoggingContentGenerator`. Lets each per-attempt + * `qwen-code.llm_request` span carry meaningful `attempt` / `request_setup_ms` + * / `retry_total_delay_ms` attributes without changing the LLM API surface. + * + * See docs/design/telemetry-llm-request-timing-design.md (Phase 4b, D4). + */ +export interface RetryAttemptContext { + /** + * 1-based monotonic iteration counter for the current `retryWithBackoff` + * execution. Always reflects "this is the Nth time fn was called", + * regardless of normal vs persistent retry mode. Unaffected by the + * `attempt = maxAttempts - 1` clamping that keeps the persistent-mode loop + * alive. + */ + readonly attempt: number; + /** + * Sum of all backoff delays BEFORE this attempt started (ms). 0 for attempt 1. + * Accumulates across the retry loop. + */ + readonly retryTotalDelayMs: number; + /** + * Time from `retryWithBackoff` entry to THIS attempt's start (ms). 0 for + * attempt 1 of a no-retry success. For attempt N>1, equals cumulative time + * spent in attempts 1..N-1 plus their backoff sleeps. Computed in `retry.ts` + * immediately before `await fn()` to avoid measurement drift across layers. + */ + readonly requestSetupMs: number; +} + +export const retryContext = new AsyncLocalStorage();