diff --git a/docs/design/telemetry-llm-request-timing-design.md b/docs/design/telemetry-llm-request-timing-design.md
index 4a41b082d16..e1b1b06eb9f 100644
--- a/docs/design/telemetry-llm-request-timing-design.md
+++ b/docs/design/telemetry-llm-request-timing-design.md
@@ -126,7 +126,13 @@ When `attempt === 1` and no retries happened, `request_setup_ms` is small (just
 2. **Single-trace debug** — operator sees `duration_ms=12000, request_setup_ms=11500, ttft_ms=200, sampling_ms=300` → instantly diagnoses "retries ate 11.5s, model itself was fast." Computing `request_setup_ms` from other fields requires also exposing `sampling_ms`, which we do anyway (D6).
 3. **Negligible cost** — 1 INT64 attribute. Same order of magnitude as the existing `input_tokens`, `output_tokens` attributes. Backend ingest cost is not material.
 
-### D4 — Retry telemetry: `onRetry` callback option on `retryWithBackoff` + new `ApiRetryEvent`
+### D4 — Retry telemetry: `onRetry` callback option on `retryWithBackoff` + `ApiRetryEvent` + AsyncLocalStorage propagation
+
+> **Phase 4b update (post-design discovery)**: this section was originally written assuming claude-code's "one LLM span owns the retry loop" pattern. While implementing Phase 4b, we discovered that qwen-code's 4 `retryWithBackoff` call sites (`client.ts:2109`, `baseLlmClient.ts:235,333`, `geminiChat.ts:2035` — line numbers as of merge) all wrap `apiCall = () => contentGenerator.generateContent(...)`. The retry layer sits **above** LoggingContentGenerator. Each retry attempt invokes `apiCall()` fresh → fresh `qwen-code.llm_request` span. There is no single shared span across attempts. An in-`LoggingContentGenerator` accumulator wouldn't work.
+>
+> **Resolution**: propagate retry state via `AsyncLocalStorage` (`retryContext` in `packages/core/src/utils/retryContext.ts`). `retryWithBackoff` wraps each `await fn()` in `retryContext.run({ attempt, requestSetupMs, retryTotalDelayMs }, fn)`. `LoggingContentGenerator` reads the ALS in its synchronous prelude and forwards the values to `endLLMRequestSpan`. This actually gives **richer** observability than the original plan — each per-attempt span has its own `duration_ms` / `ttft_ms` / error details AND knows where in the retry budget it sits via the per-attempt `attempt` / `requestSetupMs` / `retryTotalDelayMs` attributes.
+>
+> The ALS approach matches existing patterns in the codebase (`promptIdContext`, `subagentNameContext`, `agent-context`) — minimal new surface, well-understood semantics. Plan-mode review process captured this revision through 3 review rounds finding 22 issues, all addressed before merge.
 
 `retryWithBackoff` currently calls `logRetryAttempt` (`retry.ts:343`) which only writes to `debugLogger.warn`. We extend the `RetryOptions` interface with an opt-in callback:
 
@@ -188,14 +194,14 @@ export class ApiRetryEvent implements BaseTelemetryEvent {
 
 OTel span attributes are scalars (`string | number | boolean | array of these`). Map-typed attributes (like `retry_count_by_status: {429:2, 503:1}`) require JSON serialization and are awkward to query. Skip them.
 
-| Attribute                  | Type   | Semantic                                                                            |
-| -------------------------- | ------ | ----------------------------------------------------------------------------------- |
-| `attempt`                  | int    | 1-based final attempt count (`attemptStartTimes.length`)                            |
-| `retry_total_delay_ms`     | int    | Sum of all `delayMs` reported by `onRetry`; 0 if no retries                         |
-| `ttft_ms`                  | int    | TTFT per D1; undefined for non-streaming or aborted-before-first-chunk requests     |
-| `request_setup_ms`         | int    | Per D3                                                                              |
-| `sampling_ms`              | int    | Per D6                                                                              |
-| `output_tokens_per_second` | double | Derived; `output_tokens / (sampling_ms / 1000)`; undefined when `sampling_ms === 0` |
+| Attribute                  | Type   | Semantic                                                                                                                                 |
+| -------------------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| `attempt`                  | int    | 1-based monotonic counter from `retryContext.attempt` (this attempt's iteration). Always populated (defaults to 1 when no retry context) |
+| `retry_total_delay_ms`     | int    | Cumulative backoff sleep BEFORE this attempt started. Undefined for direct calls; 0 for attempt 1; > 0 for subsequent retried attempts   |
+| `ttft_ms`                  | int    | TTFT per D1; undefined for non-streaming or aborted-before-first-chunk requests                                                          |
+| `request_setup_ms`         | int    | Per D3                                                                                                                                   |
+| `sampling_ms`              | int    | Per D6                                                                                                                                   |
+| `output_tokens_per_second` | double | Derived; `output_tokens / (sampling_ms / 1000)`; undefined when `sampling_ms === 0`                                                      |
 
 Per-attempt status-code distribution (e.g., "2 of the 3 attempts were 429s") is queryable from log-bridge spans of `ApiRetryEvent` records. No need to duplicate it as a flattened attribute on the parent.
 
diff --git a/packages/core/src/core/baseLlmClient.ts b/packages/core/src/core/baseLlmClient.ts
index b2e56596b99..bda5eb854cb 100644
--- a/packages/core/src/core/baseLlmClient.ts
+++ b/packages/core/src/core/baseLlmClient.ts
@@ -27,6 +27,9 @@ import {
 import { reportError } from '../utils/errorReporting.js';
 import { getErrorMessage } from '../utils/errors.js';
 import { retryWithBackoff, isUnattendedMode } from '../utils/retry.js';
+import { subagentNameContext } from '../utils/subagentNameContext.js';
+import { ApiRetryEvent } from '../telemetry/types.js';
+import { logApiRetry } from '../telemetry/loggers.js';
 import { getFunctionCalls } from '../utils/generateContentResponseUtilities.js';
 import { getResponseText } from '../utils/partUtils.js';
 import { createDebugLogger } from '../utils/debugLogger.js';
@@ -242,6 +245,20 @@ export class BaseLlmClient {
             `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`,
           );
         },
+        onRetry: (info) => {
+          logApiRetry(
+            this.config,
+            new ApiRetryEvent({
+              model: requestModel,
+              promptId,
+              attemptNumber: info.attempt,
+              error: info.error,
+              statusCode: info.errorStatus,
+              retryDelayMs: info.delayMs,
+              subagentName: subagentNameContext.getStore(),
+            }),
+          );
+        },
       });
 
       const functionCalls = getFunctionCalls(result);
@@ -340,6 +357,20 @@ export class BaseLlmClient {
             `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`,
           );
         },
+        onRetry: (info) => {
+          logApiRetry(
+            this.config,
+            new ApiRetryEvent({
+              model: requestModel,
+              promptId,
+              attemptNumber: info.attempt,
+              error: info.error,
+              statusCode: info.errorStatus,
+              retryDelayMs: info.delayMs,
+              subagentName: subagentNameContext.getStore(),
+            }),
+          );
+        },
       });
 
       return {
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 103f4e8b889..46f4c22866d 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -94,7 +94,10 @@ import {
 } from '../utils/partUtils.js';
 import { promptIdContext } from '../utils/promptIdContext.js';
 import { retryWithBackoff, isUnattendedMode } from '../utils/retry.js';
+import { subagentNameContext } from '../utils/subagentNameContext.js';
 import { escapeSystemReminderTags } from '../utils/xml.js';
+import { ApiRetryEvent } from '../telemetry/types.js';
+import { logApiRetry } from '../telemetry/loggers.js';
 
 // Hook types and utilities
 import {
@@ -2096,6 +2099,24 @@ export class GeminiClient {
             `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`,
           );
         },
+        // Phase 4b — emit ApiRetryEvent telemetry for HTTP-status retries.
+        // subagent_name read from subagentNameContext (active in catch block
+        // since the entire generateContent invocation runs inside the parent
+        // subagent's ALS frame when applicable).
+        onRetry: (info) => {
+          logApiRetry(
+            this.config,
+            new ApiRetryEvent({
+              model: currentAttemptModel,
+              promptId,
+              attemptNumber: info.attempt,
+              error: info.error,
+              statusCode: info.errorStatus,
+              retryDelayMs: info.delayMs,
+              subagentName: subagentNameContext.getStore(),
+            }),
+          );
+        },
       });
       return result;
     } catch (error: unknown) {
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 9d1ec8b44cf..6c17eec1272 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -40,8 +40,10 @@ import type { StructuredError } from './turn.js';
 import {
   logContentRetry,
   logContentRetryFailure,
+  logApiRetry,
 } from '../telemetry/loggers.js';
 import { clearDetailedSpanState } from '../telemetry/detailed-span-attributes.js';
+import { subagentNameContext } from '../utils/subagentNameContext.js';
 import { type ChatRecordingService } from '../services/chatRecordingService.js';
 import {
   ChatCompressionService,
@@ -54,6 +56,7 @@ import { estimatePromptTokens } from '../services/tokenEstimation.js';
 import {
   ContentRetryEvent,
   ContentRetryFailureEvent,
+  ApiRetryEvent,
 } from '../telemetry/types.js';
 import type { UiTelemetryService } from '../telemetry/uiTelemetry.js';
 import { type ChatCompressionInfo, CompressionStatus } from './turn.js';
@@ -2346,6 +2349,20 @@ export class GeminiChat {
           `[qwen-code] Waiting for API capacity... attempt ${info.attempt}, retry in ${Math.ceil(info.remainingMs / 1000)}s\n`,
         );
       },
+      onRetry: (info) => {
+        logApiRetry(
+          this.config,
+          new ApiRetryEvent({
+            model,
+            promptId: prompt_id,
+            attemptNumber: info.attempt,
+            error: info.error,
+            statusCode: info.errorStatus,
+            retryDelayMs: info.delayMs,
+            subagentName: subagentNameContext.getStore(),
+          }),
+        );
+      },
     });
 
     return this.processStreamResponse(model, streamResponse);
diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts
index 570cc9ae59c..bfaa49b27ff 100644
--- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts
+++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.test.ts
@@ -40,6 +40,11 @@ const loggingSpanRecords = vi.hoisted(
       success?: boolean;
       inputTokens?: number;
       outputTokens?: number;
+      cachedInputTokens?: number;
+      ttftMs?: number;
+      requestSetupMs?: number;
+      attempt?: number;
+      retryTotalDelayMs?: number;
       durationMs?: number;
       error?: string;
     };
@@ -2040,3 +2045,265 @@ describe('LoggingContentGenerator', () => {
     },
   );
 });
+
+// =========================================================================
+// Phase 4b — retryContext ALS propagation into LoggingContentGenerator.
+// Asserts the contract: when the LLM call runs inside a retryContext.run()
+// frame, endLLMRequestSpan receives the frame's values. When no frame is
+// present (warmup, side-query, direct call), `attempt` defaults to 1 and
+// requestSetupMs/retryTotalDelayMs stay undefined.
+// =========================================================================
+describe('LoggingContentGenerator — Phase 4b retry context propagation', () => {
+  beforeEach(() => {
+    loggingSpanRecords.length = 0;
+    vi.mocked(logApiRequest).mockClear();
+    vi.mocked(logApiResponse).mockClear();
+    vi.mocked(logApiError).mockClear();
+  });
+
+  it('non-stream: forwards retryContext.attempt/requestSetupMs/retryTotalDelayMs to endLLMRequestSpan', async () => {
+    const { retryContext } = await import('../../utils/retryContext.js');
+
+    const wrapped = createWrappedGenerator(
+      vi.fn().mockResolvedValue(
+        createResponse('r-1', 'test-model', [{ text: 'ok' }], {
+          promptTokenCount: 10,
+          candidatesTokenCount: 5,
+          totalTokenCount: 15,
+        }),
+      ),
+      vi.fn(),
+    );
+    const generator = new LoggingContentGenerator(wrapped, createConfig(), {
+      model: 'test-model',
+      authType: AuthType.USE_OPENAI,
+      enableOpenAILogging: false,
+    });
+    const request = {
+      model: 'test-model',
+      contents: 'Hello',
+    } as unknown as GenerateContentParameters;
+
+    // Simulate being invoked from within `retryWithBackoff`'s ALS frame —
+    // the LoggingContentGenerator must read these values and forward them.
+    await retryContext.run(
+      { attempt: 3, requestSetupMs: 1200, retryTotalDelayMs: 1000 },
+      async () => {
+        await generator.generateContent(request, 'prompt-retry');
+      },
+    );
+
+    const record = getGenerateContentSpanRecord();
+    expect(record.endMetadata).toMatchObject({
+      success: true,
+      attempt: 3,
+      requestSetupMs: 1200,
+      retryTotalDelayMs: 1000,
+    });
+  });
+
+  it('non-stream: defaults attempt=1 and leaves setup/delay undefined when no retry context (direct call / warmup)', async () => {
+    const wrapped = createWrappedGenerator(
+      vi.fn().mockResolvedValue(
+        createResponse('r-2', 'test-model', [{ text: 'ok' }], {
+          promptTokenCount: 10,
+          candidatesTokenCount: 5,
+          totalTokenCount: 15,
+        }),
+      ),
+      vi.fn(),
+    );
+    const generator = new LoggingContentGenerator(wrapped, createConfig(), {
+      model: 'test-model',
+      authType: AuthType.USE_OPENAI,
+      enableOpenAILogging: false,
+    });
+    const request = {
+      model: 'test-model',
+      contents: 'Hello',
+    } as unknown as GenerateContentParameters;
+
+    // No retryContext.run() — direct invocation.
+    await generator.generateContent(request, 'prompt-direct');
+
+    const record = getGenerateContentSpanRecord();
+    const meta = record.endMetadata as {
+      attempt?: number;
+      requestSetupMs?: number;
+      retryTotalDelayMs?: number;
+    };
+    expect(meta.attempt).toBe(1);
+    expect(meta.requestSetupMs).toBeUndefined();
+    expect(meta.retryTotalDelayMs).toBeUndefined();
+  });
+
+  it('stream: snapshots retry context in synchronous prelude and forwards through stream wrapper finally block', async () => {
+    const { retryContext } = await import('../../utils/retryContext.js');
+
+    const streamFn = vi.fn().mockResolvedValue(
+      (async function* () {
+        yield createResponse('r-s1', 'test-model', [{ text: 'a' }]);
+        yield createResponse('r-s2', 'test-model', [{ text: 'b' }], {
+          promptTokenCount: 50,
+          candidatesTokenCount: 20,
+          totalTokenCount: 70,
+        });
+      })(),
+    );
+    const wrapped = createWrappedGenerator(vi.fn(), streamFn);
+    const generator = new LoggingContentGenerator(wrapped, createConfig(), {
+      model: 'test-model',
+      authType: AuthType.USE_OPENAI,
+      enableOpenAILogging: false,
+    });
+    const request = {
+      model: 'test-model',
+      contents: 'Hello',
+    } as unknown as GenerateContentParameters;
+
+    // Critical: the stream wrapper is iterated AFTER retryContext.run resolves
+    // its synchronous body. The closure-captured snapshot must carry values
+    // through to the finally block's endLLMRequestSpan call.
+    await retryContext.run(
+      { attempt: 2, requestSetupMs: 500, retryTotalDelayMs: 400 },
+      async () => {
+        const stream = await generator.generateContentStream(
+          request,
+          'prompt-retry-stream',
+        );
+        for await (const _ of stream) {
+          // consume
+        }
+      },
+    );
+
+    const record = getStreamSpanRecord();
+    expect(record.endMetadata).toMatchObject({
+      success: true,
+      attempt: 2,
+      requestSetupMs: 500,
+      retryTotalDelayMs: 400,
+      inputTokens: 50,
+      outputTokens: 20,
+    });
+  });
+
+  it('stream: defaults attempt=1 when iterated outside any retry frame', async () => {
+    const streamFn = vi.fn().mockResolvedValue(
+      (async function* () {
+        yield createResponse('r-s3', 'test-model', [{ text: 'a' }]);
+      })(),
+    );
+    const wrapped = createWrappedGenerator(vi.fn(), streamFn);
+    const generator = new LoggingContentGenerator(wrapped, createConfig(), {
+      model: 'test-model',
+      authType: AuthType.USE_OPENAI,
+      enableOpenAILogging: false,
+    });
+    const request = {
+      model: 'test-model',
+      contents: 'Hello',
+    } as unknown as GenerateContentParameters;
+
+    const stream = await generator.generateContentStream(
+      request,
+      'prompt-stream-direct',
+    );
+    for await (const _ of stream) {
+      // consume
+    }
+
+    const record = getStreamSpanRecord();
+    const meta = record.endMetadata as {
+      attempt?: number;
+      requestSetupMs?: number;
+      retryTotalDelayMs?: number;
+    };
+    expect(meta.attempt).toBe(1);
+    expect(meta.requestSetupMs).toBeUndefined();
+    expect(meta.retryTotalDelayMs).toBeUndefined();
+  });
+
+  it('stream idle-timeout path: retrySnapshot propagates to the setTimeout-fired endLLMRequestSpan (R2 #8)', async () => {
+    // Review comment R2 #8: the idle-timeout `setTimeout` fires in a separate
+    // macrotask. Verify the closure-captured retrySnapshot reaches its
+    // endLLMRequestSpan call with correct retry context values.
+    //
+    // Must use fake timers from the START so the 5-min setTimeout created
+    // inside loggingStreamWrapper uses the fake clock and can be advanced.
+    vi.useFakeTimers();
+
+    const { retryContext } = await import('../../utils/retryContext.js');
+
+    // Stream that resolves its first .next() only after we've advanced
+    // timers past the idle timeout. We use a deferred promise to hold
+    // iteration without actually hanging the test runner.
+    let releaseStream: () => void;
+    const streamBlocker = new Promise<void>((r) => {
+      releaseStream = r;
+    });
+    const neverYieldStream = (async function* () {
+      await streamBlocker; // holds until we release after timer advance
+      yield createResponse('never', 'test-model', [{ text: 'x' }]);
+    })();
+
+    const streamFn = vi.fn().mockResolvedValue(neverYieldStream);
+    const wrapped = createWrappedGenerator(vi.fn(), streamFn);
+    const generator = new LoggingContentGenerator(wrapped, createConfig(), {
+      model: 'test-model',
+      authType: AuthType.USE_OPENAI,
+      enableOpenAILogging: false,
+    });
+    const request = {
+      model: 'test-model',
+      contents: 'Hello',
+    } as unknown as GenerateContentParameters;
+
+    // Start the stream inside a retry context. The generator creation
+    // (generateContentStream) runs synchronously enough to capture the
+    // retrySnapshot. The consumer's first .next() call starts the for-await
+    // which immediately awaits the streamBlocker — at that point the idle
+    // timeout setTimeout(5min) is already scheduled.
+    await retryContext.run(
+      { attempt: 4, requestSetupMs: 3000, retryTotalDelayMs: 2500 },
+      async () => {
+        const stream = await generator.generateContentStream(
+          request,
+          'prompt-idle-timeout',
+        );
+        const iter = stream[Symbol.asyncIterator]();
+        // Start iteration — this enters for-await, resets the idle timer,
+        // then blocks on streamBlocker.
+        void iter.next();
+      },
+    );
+
+    // Advance past the 5-minute idle timeout (STREAM_IDLE_TIMEOUT_MS)
+    await vi.advanceTimersByTimeAsync(6 * 60_000);
+
+    // Release the stream so the generator can clean up
+    releaseStream!();
+    await vi.advanceTimersByTimeAsync(100);
+
+    vi.useRealTimers();
+
+    // Find the span that was ended by the idle timeout
+    const records = loggingSpanRecords.filter(
+      (r) => r.name === 'qwen-code.llm_request' && r.endMetadata !== undefined,
+    );
+    const timeoutRecord = records.find(
+      (r) => r.endMetadata?.error === 'Stream span timed out (idle)',
+    );
+    expect(timeoutRecord).toBeDefined();
+    const meta = timeoutRecord!.endMetadata as {
+      attempt?: number;
+      requestSetupMs?: number;
+      retryTotalDelayMs?: number;
+      error?: string;
+    };
+    expect(meta.attempt).toBe(4);
+    expect(meta.requestSetupMs).toBe(3000);
+    expect(meta.retryTotalDelayMs).toBe(2500);
+    expect(meta.error).toBe('Stream span timed out (idle)');
+  });
+});
diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
index 9a20db8f252..51257ac9ceb 100644
--- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
+++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
@@ -62,6 +62,34 @@ import {
   API_CALL_FAILED_SPAN_STATUS_MESSAGE,
 } from '../../telemetry/tracer.js';
 import { hasUserVisibleContent } from './streamContentDetection.js';
+import {
+  retryContext,
+  type RetryAttemptContext,
+} from '../../utils/retryContext.js';
+
+/**
+ * Phase 4b — read the active retry context once, default attempt to 1 when
+ * absent (warmup/side-queries/direct calls). Returns the fields in the exact
+ * shape consumed by `endLLMRequestSpan` so callers can spread the result.
+ *
+ * Called in the SYNCHRONOUS PRELUDE of `generateContent` / `generateContentStream`
+ * — before the first await — because the streaming path returns an
+ * AsyncGenerator that's iterated AFTER `retryWithBackoff` has resolved and
+ * the ALS frame has exited. The closure carries this snapshot to all later
+ * endLLMRequestSpan callsites (success / error / idle-timeout / abort).
+ */
+function snapshotRetryMetadata(): {
+  attempt: number;
+  requestSetupMs?: number;
+  retryTotalDelayMs?: number;
+} {
+  const ctx: RetryAttemptContext | undefined = retryContext.getStore();
+  return {
+    attempt: ctx?.attempt ?? 1,
+    requestSetupMs: ctx?.requestSetupMs,
+    retryTotalDelayMs: ctx?.retryTotalDelayMs,
+  };
+}
 
 const debugLogger = createDebugLogger('LOGGING_CONTENT_GENERATOR');
 
@@ -213,6 +241,10 @@ export class LoggingContentGenerator implements ContentGenerator {
     req: GenerateContentParameters,
     userPromptId: string,
   ): Promise<GenerateContentResponse> {
+    // Phase 4b — snapshot retry context in the synchronous prelude BEFORE any
+    // await. ALS frame from `retryWithBackoff` is guaranteed to be active here.
+    const retrySnapshot = snapshotRetryMetadata();
+
     const llmSpan = startLLMRequestSpan(req.model, userPromptId);
     try {
       llmSpan.setAttribute('llm_request.stream', false);
@@ -288,6 +320,7 @@ export class LoggingContentGenerator implements ContentGenerator {
         outputTokens: response.usageMetadata?.candidatesTokenCount,
         cachedInputTokens: response.usageMetadata?.cachedContentTokenCount,
         durationMs: Date.now() - startTime,
+        ...retrySnapshot,
       });
       return response;
     } catch (error) {
@@ -304,6 +337,7 @@ export class LoggingContentGenerator implements ContentGenerator {
         error: aborted
           ? API_CALL_ABORTED_SPAN_STATUS_MESSAGE
           : API_CALL_FAILED_SPAN_STATUS_MESSAGE,
+        ...retrySnapshot,
       });
       await context.with(spanContext, async () => {
         this.safelyLogApiError('', durationMs, error, req.model, userPromptId);
@@ -326,6 +360,15 @@ export class LoggingContentGenerator implements ContentGenerator {
     req: GenerateContentParameters,
     userPromptId: string,
   ): Promise<AsyncGenerator<GenerateContentResponse>> {
+    // Phase 4b — snapshot retry context in the synchronous prelude. This is
+    // the only point where the ALS frame from `retryWithBackoff` is guaranteed
+    // to be active for the streaming path: once this function returns the
+    // AsyncGenerator, the caller iterates AFTER `retryWithBackoff` has
+    // resolved and the frame has exited. Threaded as a parameter to
+    // loggingStreamWrapper so its closure carries the snapshot to all later
+    // endLLMRequestSpan callsites (success / error / idle-timeout / abort).
+    const retrySnapshot = snapshotRetryMetadata();
+
     const llmSpan = startLLMRequestSpan(req.model, userPromptId);
     try {
       llmSpan.setAttribute('llm_request.stream', true);
@@ -383,6 +426,7 @@ export class LoggingContentGenerator implements ContentGenerator {
         error: aborted
           ? API_CALL_ABORTED_SPAN_STATUS_MESSAGE
           : API_CALL_FAILED_SPAN_STATUS_MESSAGE,
+        ...retrySnapshot,
       });
       try {
         await this.safelyLogOpenAIInteraction(
@@ -416,6 +460,7 @@ export class LoggingContentGenerator implements ContentGenerator {
         llmSpan,
         spanContext,
         req.config?.abortSignal,
+        retrySnapshot,
       ),
     );
   }
@@ -451,6 +496,12 @@ export class LoggingContentGenerator implements ContentGenerator {
     span?: Span,
     spanContext?: Context,
     abortSignal?: AbortSignal,
+    // Phase 4b — snapshot of retry context captured BEFORE the stream wrapper
+    // returned, when the ALS frame from `retryWithBackoff` was still active.
+    // Closure-carried to every endLLMRequestSpan callsite below so the
+    // idle-timeout `setTimeout` callback sees the same values as the
+    // entry-time read.
+    retrySnapshot?: ReturnType<typeof snapshotRetryMetadata>,
   ): AsyncGenerator<GenerateContentResponse> {
     const isInternal = isInternalPromptId(userPromptId);
     // Skip collecting full responses for internal prompts to avoid memory
@@ -504,6 +555,7 @@ export class LoggingContentGenerator implements ContentGenerator {
               success: false,
               durationMs: Date.now() - startTime,
               error: 'Stream span timed out (idle)',
+              ...retrySnapshot,
             });
             spanEndedByTimeout = true;
           }, STREAM_IDLE_TIMEOUT_MS);
@@ -626,6 +678,7 @@ export class LoggingContentGenerator implements ContentGenerator {
               ? API_CALL_ABORTED_SPAN_STATUS_MESSAGE
               : API_CALL_FAILED_SPAN_STATUS_MESSAGE
             : undefined,
+          ...retrySnapshot,
         });
       }
     }
diff --git a/packages/core/src/telemetry/constants.ts b/packages/core/src/telemetry/constants.ts
index 8ef41eaef83..f69ab170285 100644
--- a/packages/core/src/telemetry/constants.ts
+++ b/packages/core/src/telemetry/constants.ts
@@ -29,6 +29,11 @@ export const EVENT_INVALID_CHUNK = 'qwen-code.chat.invalid_chunk';
 export const EVENT_CONTENT_RETRY = 'qwen-code.chat.content_retry';
 export const EVENT_CONTENT_RETRY_FAILURE =
   'qwen-code.chat.content_retry_failure';
+// Phase 4b — HTTP-status retry telemetry emitted by `retryWithBackoff` for
+// 429 / 5xx errors at LLM call sites. Distinct from EVENT_CONTENT_RETRY,
+// which is fired by geminiChat for InvalidStreamError retries on a separate
+// retry budget. See docs/design/telemetry-llm-request-timing-design.md.
+export const EVENT_API_RETRY = 'qwen-code.api_retry';
 export const EVENT_CONVERSATION_FINISHED = 'qwen-code.conversation_finished';
 export const EVENT_MALFORMED_JSON_RESPONSE =
   'qwen-code.malformed_json_response';
diff --git a/packages/core/src/telemetry/index.ts b/packages/core/src/telemetry/index.ts
index cd8aad316b6..fce64dbfc3b 100644
--- a/packages/core/src/telemetry/index.ts
+++ b/packages/core/src/telemetry/index.ts
@@ -107,6 +107,7 @@ export {
   recordInvalidChunk,
   recordContentRetry,
   recordContentRetryFailure,
+  recordApiRetry,
   // Performance monitoring functions
   recordStartupPerformance,
   recordMemoryUsage,
diff --git a/packages/core/src/telemetry/loggers.test.ts b/packages/core/src/telemetry/loggers.test.ts
index b93d7e930af..b8632a4326d 100644
--- a/packages/core/src/telemetry/loggers.test.ts
+++ b/packages/core/src/telemetry/loggers.test.ts
@@ -58,6 +58,7 @@ import {
   logExtensionUninstall,
   logHookCall,
   logApiError,
+  logApiRetry,
 } from './loggers.js';
 import * as metrics from './metrics.js';
 import { QwenLogger } from './qwen-logger/qwen-logger.js';
@@ -82,6 +83,7 @@ import {
   ExtensionUninstallEvent,
   HookCallEvent,
   ApiErrorEvent,
+  ApiRetryEvent,
 } from './types.js';
 import { FileOperation } from './metrics.js';
 import type {
@@ -1694,4 +1696,86 @@ describe('loggers', () => {
       expect(passedEvent).toBe(event);
     });
   });
+
+  // Phase 4b — logApiRetry: HTTP-status retry telemetry from retryWithBackoff.
+  describe('logApiRetry (Phase 4b)', () => {
+    const mockQwenLogger = {
+      logApiRetryEvent: vi.fn(),
+    };
+
+    beforeEach(() => {
+      vi.spyOn(QwenLogger, 'getInstance').mockReturnValue(
+        mockQwenLogger as unknown as QwenLogger,
+      );
+      mockQwenLogger.logApiRetryEvent.mockClear();
+      vi.spyOn(metrics, 'recordApiRetry');
+    });
+
+    function buildEvent(
+      overrides: Partial<{
+        model: string;
+        promptId: string;
+        attemptNumber: number;
+        status: number;
+        delay: number;
+        errorMsg: string;
+        subagentName: string;
+      }> = {},
+    ): ApiRetryEvent {
+      const err = new Error(overrides.errorMsg ?? 'rate limited');
+      return new ApiRetryEvent({
+        model: overrides.model ?? 'qwen3',
+        promptId: overrides.promptId ?? 'p-1',
+        attemptNumber: overrides.attemptNumber ?? 2,
+        error: err,
+        statusCode: overrides.status ?? 429,
+        retryDelayMs: overrides.delay ?? 1500,
+        subagentName: overrides.subagentName,
+      });
+    }
+
+    it('fans out to all 3 sinks: QwenLogger, OTel log, and metric counter', () => {
+      const mockConfig = makeFakeConfig({ sessionId: 'test-session-id' });
+      const event = buildEvent();
+      logApiRetry(mockConfig, event);
+
+      // 1. QwenLogger RUM
+      expect(mockQwenLogger.logApiRetryEvent).toHaveBeenCalledWith(event);
+      // 2. OTel log signal — picked up by LogToSpanProcessor to bridge as span
+      expect(mockLogger.emit).toHaveBeenCalledTimes(1);
+      const logRecord = mockLogger.emit.mock.calls[0][0];
+      expect(logRecord.body).toContain('API retry attempt 2');
+      expect(logRecord.body).toContain('qwen3');
+      expect(logRecord.body).toContain('status 429');
+      expect(logRecord.attributes['event.name']).toBe('qwen-code.api_retry');
+      expect(logRecord.attributes['attempt_number']).toBe(2);
+      expect(logRecord.attributes['retry_delay_ms']).toBe(1500);
+      expect(logRecord.attributes['status_code']).toBe(429);
+      expect(logRecord.attributes['model']).toBe('qwen3');
+      // 3. Metric counter — tagged with {model}
+      expect(metrics.recordApiRetry).toHaveBeenCalledWith(mockConfig, {
+        model: 'qwen3',
+      });
+    });
+
+    it('propagates subagent_name when present', () => {
+      const mockConfig = makeFakeConfig({ sessionId: 'test-session-id' });
+      const event = buildEvent({ subagentName: 'explore-agent' });
+      logApiRetry(mockConfig, event);
+
+      const logRecord = mockLogger.emit.mock.calls[0][0];
+      expect(logRecord.attributes['subagent_name']).toBe('explore-agent');
+    });
+
+    it('skips logger.emit and metric counter when SDK is not initialized (QwenLogger still called)', () => {
+      vi.spyOn(sdk, 'isTelemetrySdkInitialized').mockReturnValue(false);
+      const mockConfig = makeFakeConfig({ sessionId: 'test-session-id' });
+      const event = buildEvent();
+      logApiRetry(mockConfig, event);
+
+      expect(mockQwenLogger.logApiRetryEvent).toHaveBeenCalledWith(event);
+      expect(mockLogger.emit).not.toHaveBeenCalled();
+      expect(metrics.recordApiRetry).not.toHaveBeenCalled();
+    });
+  });
 });
diff --git a/packages/core/src/telemetry/loggers.ts b/packages/core/src/telemetry/loggers.ts
index b45798fc7cd..716aa0e0cff 100644
--- a/packages/core/src/telemetry/loggers.ts
+++ b/packages/core/src/telemetry/loggers.ts
@@ -30,6 +30,7 @@ import {
   EVENT_CHAT_COMPRESSION,
   EVENT_CONTENT_RETRY,
   EVENT_CONTENT_RETRY_FAILURE,
+  EVENT_API_RETRY,
   EVENT_FILE_OPERATION,
   EVENT_RIPGREP_FALLBACK,
   EVENT_EXTENSION_INSTALL,
@@ -57,6 +58,7 @@ import {
   recordChatCompressionMetrics,
   recordContentRetry,
   recordContentRetryFailure,
+  recordApiRetry,
   recordFileOperationMetric,
   recordInvalidChunk,
   recordModelSlashCommand,
@@ -93,6 +95,7 @@ import type {
   ChatCompressionEvent,
   ContentRetryEvent,
   ContentRetryFailureEvent,
+  ApiRetryEvent,
   RipgrepFallbackEvent,
   ToolOutputTruncatedEvent,
   ExtensionDisableEvent,
@@ -756,6 +759,38 @@ export function logContentRetryFailure(
   recordContentRetryFailure(config);
 }
 
+/**
+ * Phase 4b — Emits an HTTP-status retry event fired from `retryWithBackoff`
+ * at an LLM call site (via the `onRetry` callback opt-in). Distinct from
+ * `logContentRetry`, which is fired by `geminiChat`'s content-recovery loop.
+ *
+ * Three-sink fan-out, matching the `logContentRetry` shape exactly:
+ *   1. QwenLogger RUM ingestion (Aliyun internal stats)
+ *   2. OTel log signal via `logger.emit()` — picked up by LogToSpanProcessor
+ *      and bridged to a span sibling under the caller's active span (typically
+ *      interaction or tool, NOT the failed LLM span — that span has already
+ *      ended by the time onRetry fires).
+ *   3. `recordApiRetry` Counter increment for per-model retry-rate dashboards.
+ */
+export function logApiRetry(config: Config, event: ApiRetryEvent): void {
+  QwenLogger.getInstance(config)?.logApiRetryEvent(event);
+  if (!isTelemetrySdkInitialized()) return;
+
+  const attributes: LogAttributes = {
+    ...getCommonAttributes(config),
+    ...event,
+    'event.name': EVENT_API_RETRY,
+  };
+
+  const logger = logs.getLogger(SERVICE_NAME);
+  const logRecord: LogRecord = {
+    body: `API retry attempt ${event.attempt_number} for ${event.model} (status ${event.status_code ?? 'unknown'}).`,
+    attributes,
+  };
+  logger.emit(logRecord);
+  recordApiRetry(config, { model: event.model });
+}
+
 export function logSubagentExecution(
   config: Config,
   event: SubagentExecutionEvent,
diff --git a/packages/core/src/telemetry/metrics.ts b/packages/core/src/telemetry/metrics.ts
index 7d9de142ee5..e6c47efd50c 100644
--- a/packages/core/src/telemetry/metrics.ts
+++ b/packages/core/src/telemetry/metrics.ts
@@ -20,6 +20,9 @@ const FILE_OPERATION_COUNT = `${SERVICE_NAME}.file.operation.count`;
 const INVALID_CHUNK_COUNT = `${SERVICE_NAME}.chat.invalid_chunk.count`;
 const CONTENT_RETRY_COUNT = `${SERVICE_NAME}.chat.content_retry.count`;
 const CONTENT_RETRY_FAILURE_COUNT = `${SERVICE_NAME}.chat.content_retry_failure.count`;
+// Phase 4b — Counts HTTP-status retries emitted by `retryWithBackoff` at LLM
+// call sites. Tagged by `model` so operators can graph per-model retry rate.
+const API_RETRY_COUNT = `${SERVICE_NAME}.api.retry.count`;
 const MODEL_SLASH_COMMAND_CALL_COUNT = `${SERVICE_NAME}.slash_command.model.call_count`;
 export const SUBAGENT_EXECUTION_COUNT = `${SERVICE_NAME}.subagent.execution.count`;
 
@@ -135,6 +138,15 @@ const COUNTER_DEFINITIONS = {
     assign: (c: Counter) => (contentRetryFailureCounter = c),
     attributes: {} as Record<string, never>,
   },
+  [API_RETRY_COUNT]: {
+    description:
+      'Counts HTTP-status retries (429/5xx) at LLM call sites, emitted by retryWithBackoff onRetry callback.',
+    valueType: ValueType.INT,
+    assign: (c: Counter) => (apiRetryCounter = c),
+    attributes: {} as {
+      model: string;
+    },
+  },
   [MODEL_SLASH_COMMAND_CALL_COUNT]: {
     description: 'Counts model slash command calls.',
     valueType: ValueType.INT,
@@ -356,6 +368,7 @@ let chatCompressionCounter: Counter | undefined;
 let invalidChunkCounter: Counter | undefined;
 let contentRetryCounter: Counter | undefined;
 let contentRetryFailureCounter: Counter | undefined;
+let apiRetryCounter: Counter | undefined;
 let subagentExecutionCounter: Counter | undefined;
 let modelSlashCommandCallCounter: Counter | undefined;
 
@@ -626,6 +639,23 @@ export function recordContentRetryFailure(config: Config): void {
   );
 }
 
+/**
+ * Phase 4b — Records a metric for an HTTP-status retry at an LLM call site.
+ * Tagged by `model` so operators can graph per-model retry rate. Called from
+ * `logApiRetry` in loggers.ts which is wired to `retryWithBackoff`'s `onRetry`
+ * callback at the 4 LLM call sites.
+ */
+export function recordApiRetry(
+  config: Config,
+  attributes: MetricDefinitions[typeof API_RETRY_COUNT]['attributes'],
+): void {
+  if (!apiRetryCounter || !isMetricsInitialized) return;
+  apiRetryCounter.add(1, {
+    ...baseMetricDefinition.getCommonAttributes(config),
+    ...attributes,
+  });
+}
+
 export function recordModelSlashCommand(
   config: Config,
   event: ModelSlashCommandEvent,
diff --git a/packages/core/src/telemetry/qwen-logger/qwen-logger.ts b/packages/core/src/telemetry/qwen-logger/qwen-logger.ts
index b1a487ce675..0c932de84b8 100644
--- a/packages/core/src/telemetry/qwen-logger/qwen-logger.ts
+++ b/packages/core/src/telemetry/qwen-logger/qwen-logger.ts
@@ -31,6 +31,7 @@ import type {
   ChatCompressionEvent,
   InvalidChunkEvent,
   ContentRetryEvent,
+  ApiRetryEvent,
   ContentRetryFailureEvent,
   ConversationFinishedEvent,
   SubagentExecutionEvent,
@@ -958,6 +959,26 @@ export class QwenLogger {
     this.flushIfNeeded();
   }
 
+  // Phase 4b — HTTP-status retry from retryWithBackoff (429/5xx). Distinct from
+  // logContentRetryEvent which is fired by geminiChat's content-recovery loop.
+  logApiRetryEvent(event: ApiRetryEvent): void {
+    const rumEvent = this.createActionEvent('misc', 'api_retry', {
+      properties: {
+        model: event.model,
+        prompt_id: event.prompt_id ?? '',
+        attempt_number: event.attempt_number,
+        error_type: event.error_type ?? 'unknown',
+        status_code:
+          event.status_code !== undefined ? String(event.status_code) : '',
+        retry_delay_ms: event.retry_delay_ms,
+        subagent_name: event.subagent_name ?? '',
+      },
+    });
+
+    this.enqueueLogEvent(rumEvent);
+    this.flushIfNeeded();
+  }
+
   // arena events
   logArenaSessionStartedEvent(event: ArenaSessionStartedEvent): void {
     const rumEvent = this.createActionEvent('arena', 'arena_session_started', {
diff --git a/packages/core/src/telemetry/session-tracing.test.ts b/packages/core/src/telemetry/session-tracing.test.ts
index 7a40c304532..9cd046026bc 100644
--- a/packages/core/src/telemetry/session-tracing.test.ts
+++ b/packages/core/src/telemetry/session-tracing.test.ts
@@ -521,33 +521,42 @@ describe('session-tracing', () => {
         durationMs: 1000,
       });
 
-      // sampling_ms = duration - ttft - (requestSetup ?? 0) = 1000 - 200 - 0
+      // sampling_ms = duration - ttft = 1000 - 200 (setup is NOT subtracted —
+      // duration_ms only covers ttft + sampling, never the setup phase that
+      // precedes the span. See Phase 4b commit fixing the formula bug.)
       expect(mockSpans[0]!.attributes['sampling_ms']).toBe(800);
     });
 
-    it('endLLMRequestSpan derives sampling_ms accounting for requestSetupMs (Phase 4b populates this)', () => {
+    it('endLLMRequestSpan does NOT subtract requestSetupMs from sampling_ms (Phase 4b bug fix)', () => {
+      // Phase 4a's formula `duration - ttft - setup` double-counted setup
+      // because duration_ms ALREADY excludes setup (span starts after setup).
+      // Phase 4b populates requestSetupMs with cumulative retry overhead —
+      // if the formula still subtracted setup, sampling_ms would clamp to 0
+      // for every retried request, wiping output-throughput data.
       const span = startLLMRequestSpan('m', 'p');
       endLLMRequestSpan(span, {
         success: true,
         ttftMs: 200,
-        requestSetupMs: 300,
+        requestSetupMs: 300, // would yield 500 under old formula; we want 800
         durationMs: 1000,
       });
 
-      // sampling_ms = 1000 - 200 - 300
-      expect(mockSpans[0]!.attributes['sampling_ms']).toBe(500);
+      expect(mockSpans[0]!.attributes['sampling_ms']).toBe(800);
+      // request_setup_ms is still emitted as its own attribute — operators can
+      // see the retry overhead AND the sampling time independently.
+      expect(mockSpans[0]!.attributes['request_setup_ms']).toBe(300);
     });
 
-    it('endLLMRequestSpan clamps sampling_ms to 0 when ttft + setup exceed duration (clock skew)', () => {
+    it('endLLMRequestSpan clamps sampling_ms to 0 when ttft exceeds duration (clock skew)', () => {
       const span = startLLMRequestSpan('m', 'p');
       endLLMRequestSpan(span, {
         success: true,
-        ttftMs: 800,
-        requestSetupMs: 500,
+        ttftMs: 1500,
         durationMs: 1000,
       });
 
-      // Math.max(0, 1000 - 800 - 500) = 0
+      // Math.max(0, 1000 - 1500) = 0 — only triggers when ttft > duration,
+      // which in practice means clock drift or a measurement bug.
       expect(mockSpans[0]!.attributes['sampling_ms']).toBe(0);
     });
 
diff --git a/packages/core/src/telemetry/session-tracing.ts b/packages/core/src/telemetry/session-tracing.ts
index ee53a56091d..c62f65fe32e 100644
--- a/packages/core/src/telemetry/session-tracing.ts
+++ b/packages/core/src/telemetry/session-tracing.ts
@@ -70,20 +70,27 @@ export interface LLMRequestMetadata {
    */
   ttftMs?: number;
   /**
-   * Time from generateContent/generateContentStream entry to the start of the
-   * successful attempt (ms). Includes all failed retries + backoff sleeps.
-   * Populated by the retry layer in Phase 4b; undefined in Phase 4a.
+   * Time from `retryWithBackoff` entry to THIS attempt's start (ms). On a
+   * successful-attempt span this doubles as the total retry overhead before
+   * success. On a failed-attempt span this is the cumulative time elapsed in
+   * the retry budget at the moment this attempt fired (= attempts 1..N-1's
+   * durations + their backoff sleeps).
+   *
+   * Undefined when no retry context exists (direct calls bypassing
+   * retryWithBackoff: warmup, side-queries, etc.). Populated by the retry
+   * layer in Phase 4b via AsyncLocalStorage (`retryContext`).
    */
   requestSetupMs?: number;
   /**
-   * Final attempt number (1-based). 1 = no retries. Populated by the retry
-   * layer in Phase 4b; undefined in Phase 4a.
+   * 1-based monotonic attempt counter, populated by LoggingContentGenerator
+   * from `retryContext.getStore()`. Defaults to 1 when no retry context is
+   * present so dashboards filtering `WHERE attempt=1` include direct/warmup
+   * calls. Populated by Phase 4b retry layer for attempt >= 2.
    */
   attempt?: number;
   /**
-   * Sum of all backoff delays before the successful attempt (ms). 0 if no
-   * retries occurred. Populated by the retry layer in Phase 4b; undefined
-   * in Phase 4a.
+   * Sum of all backoff delays BEFORE this attempt started (ms). 0 for attempt 1.
+   * Undefined when no retry context exists. Populated by Phase 4b retry layer.
    */
   retryTotalDelayMs?: number;
 }
@@ -438,15 +445,21 @@ export function endLLMRequestSpan(
         endAttributes['retry_total_delay_ms'] = metadata.retryTotalDelayMs;
       }
       // Derived: sampling_ms = time from first user-visible chunk to end
-      // (== output generation time, excluding setup + first-token delay).
-      // Computable only when ttftMs is set. requestSetupMs defaults to 0
-      // when undefined (no retries) — this gives the correct sampling
-      // duration in both Phase 4a (no retry data) and Phase 4b (with).
+      // (== output generation time for THIS attempt).
+      //
+      // NOTE on Phase 4a bug fix: previous formula `duration - ttft - setup`
+      // double-counted the setup time. `duration_ms` is computed as
+      // `Date.now() - spanCtx.startTime`, and startTime is captured when
+      // `startLLMRequestSpan` runs — which is AFTER `requestSetupMs` worth of
+      // overhead has already passed. So the span's `duration_ms` only covers
+      // `ttft + sampling`, never the preceding setup. Subtracting `setup` again
+      // is wrong. In Phase 4a, `requestSetupMs` was always undefined so the
+      // bug was masked (0 subtraction). Phase 4b populates `requestSetupMs`
+      // with cumulative retry overhead, which would have clamped sampling_ms
+      // to 0 for every retried request — wiping out output-throughput data
+      // exactly when operators need it most. Fixed here.
       if (metadata.ttftMs !== undefined) {
-        const samplingMs = Math.max(
-          0,
-          duration - metadata.ttftMs - (metadata.requestSetupMs ?? 0),
-        );
+        const samplingMs = Math.max(0, duration - metadata.ttftMs);
         endAttributes['sampling_ms'] = samplingMs;
         // Derived: output tokens per second during sampling. Undefined when
         // sampling_ms is 0 (avoid divide-by-zero) or when outputTokens missing.
diff --git a/packages/core/src/telemetry/types.ts b/packages/core/src/telemetry/types.ts
index 866d0637dbe..fcb8ca8d017 100644
--- a/packages/core/src/telemetry/types.ts
+++ b/packages/core/src/telemetry/types.ts
@@ -646,6 +646,69 @@ export class ContentRetryEvent implements BaseTelemetryEvent {
   }
 }
 
+/**
+ * Phase 4b — HTTP-status retry telemetry. Emitted by `retryWithBackoff` (via
+ * the `onRetry` callback opt-in) for HTTP 429 / 5xx retries at LLM call sites.
+ *
+ * Distinct from {@link ContentRetryEvent}, which is emitted by `geminiChat`'s
+ * for-loop for `InvalidStreamError` retries that go through a SEPARATE retry
+ * budget (`INVALID_CONTENT_RETRY_OPTIONS`, NOT `retryWithBackoff`). A single
+ * user prompt may fire BOTH event types; sum across event types to count total
+ * retries per prompt_id.
+ */
+export class ApiRetryEvent implements BaseTelemetryEvent {
+  'event.name': 'api_retry';
+  'event.timestamp': string; // ISO 8601
+  model: string;
+  prompt_id?: string;
+  attempt_number: number; // 1-based monotonic counter (matches ALS retryContext.attempt)
+  error_type?: string;
+  error_message: string;
+  status_code?: number | string;
+  retry_delay_ms: number;
+  /**
+   * Reports the backoff delay following this failed attempt (NOT the attempt's
+   * own duration — that lives on the corresponding `qwen-code.llm_request`
+   * span's `duration_ms` attribute). Set equal to `retry_delay_ms` so the
+   * LogToSpanProcessor bridge span visualises the sleep window between the
+   * failed and next attempt in the trace timeline.
+   */
+  duration_ms: number;
+  /**
+   * Name of the subagent that issued the retrying request, or undefined when
+   * the request originates from the main conversation. Read from
+   * `subagentNameContext.getStore()` at the caller site (subagentNameContext
+   * is still active inside `retry.ts`'s catch block where `onRetry` fires).
+   */
+  subagent_name?: string;
+
+  constructor(opts: {
+    model: string;
+    promptId?: string;
+    attemptNumber: number;
+    error: unknown;
+    statusCode?: number | string;
+    retryDelayMs: number;
+    subagentName?: string;
+  }) {
+    this['event.name'] = 'api_retry';
+    this['event.timestamp'] = new Date().toISOString();
+    this.model = opts.model;
+    this.prompt_id = opts.promptId;
+    this.attempt_number = opts.attemptNumber;
+    this.error_message =
+      opts.error instanceof Error
+        ? opts.error.message
+        : String(opts.error ?? 'unknown error');
+    this.error_type =
+      opts.error instanceof Error ? opts.error.constructor.name : undefined;
+    this.status_code = opts.statusCode;
+    this.retry_delay_ms = opts.retryDelayMs;
+    this.duration_ms = opts.retryDelayMs;
+    this.subagent_name = opts.subagentName;
+  }
+}
+
 export class ContentRetryFailureEvent implements BaseTelemetryEvent {
   'event.name': 'content_retry_failure';
   'event.timestamp': string;
@@ -963,6 +1026,7 @@ export type TelemetryEvent =
   | InvalidChunkEvent
   | ContentRetryEvent
   | ContentRetryFailureEvent
+  | ApiRetryEvent
   | SubagentExecutionEvent
   | ExtensionEnableEvent
   | ExtensionInstallEvent
diff --git a/packages/core/src/utils/retry.test.ts b/packages/core/src/utils/retry.test.ts
index 0fd2478602a..cfcb13cc804 100644
--- a/packages/core/src/utils/retry.test.ts
+++ b/packages/core/src/utils/retry.test.ts
@@ -14,12 +14,13 @@ import {
   afterEach,
   afterAll,
 } from 'vitest';
-import type { HttpError } from './retry.js';
+import type { HttpError, RetryAttemptInfo } from './retry.js';
 import {
   retryWithBackoff,
   isTransientCapacityError,
   isUnattendedMode,
 } from './retry.js';
+import { retryContext } from './retryContext.js';
 import { getErrorStatus } from './errors.js';
 import { setSimulate429 } from './testUtils.js';
 import { AuthType } from '../core/contentGenerator.js';
@@ -1024,3 +1025,404 @@ describe('getErrorStatus', () => {
     expect(getErrorStatus(new Error('HTTP_STATUS/4291'))).toBeUndefined();
   });
 });
+
+// =========================================================================
+// Phase 4b — retry telemetry (ALS context + onRetry callback + monotonic counter)
+// =========================================================================
+describe('retryWithBackoff — Phase 4b retry context (ALS)', () => {
+  beforeEach(() => {
+    // Use fake timers consistently with the rest of this file — vitest's
+    // useRealTimers between describes is unreliable when other describes
+    // have stubbed timer globals. We advance via vi.runAllTimersAsync().
+    vi.useFakeTimers();
+    setSimulate429(false);
+    console.warn = vi.fn();
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+    vi.restoreAllMocks();
+  });
+
+  it('sets retryContext.attempt monotonically across attempts', async () => {
+    const seenAttempts: number[] = [];
+    let attempts = 0;
+    const fn = vi.fn(async () => {
+      attempts++;
+      seenAttempts.push(retryContext.getStore()?.attempt ?? -1);
+      if (attempts <= 2) {
+        const err: HttpError = new Error('transient');
+        err.status = 500;
+        throw err;
+      }
+      return 'ok';
+    });
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 1,
+      maxDelayMs: 5,
+    });
+
+    await vi.runAllTimersAsync();
+    await expect(promise).resolves.toBe('ok');
+    expect(seenAttempts).toEqual([1, 2, 3]);
+  });
+
+  it('exposes retryContext.requestSetupMs / retryTotalDelayMs (== 0 for attempt 1, > 0 for retries)', async () => {
+    const snapshots: Array<{ setupMs: number; totalDelayMs: number }> = [];
+    let attempts = 0;
+    const fn = vi.fn(async () => {
+      attempts++;
+      const ctx = retryContext.getStore();
+      snapshots.push({
+        setupMs: ctx?.requestSetupMs ?? -1,
+        totalDelayMs: ctx?.retryTotalDelayMs ?? -1,
+      });
+      if (attempts <= 2) {
+        const err: HttpError = new Error('transient');
+        err.status = 500;
+        throw err;
+      }
+      return 'ok';
+    });
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 10,
+      maxDelayMs: 50,
+    });
+
+    await vi.runAllTimersAsync();
+    await promise;
+
+    // Attempt 1: nothing happened before, so both are 0.
+    expect(snapshots[0]!.setupMs).toBe(0);
+    expect(snapshots[0]!.totalDelayMs).toBe(0);
+    // Attempts 2+: both fields populate with positive values once retries
+    // have run. Exact values depend on the jittered backoff; assert monotonic.
+    expect(snapshots[1]!.setupMs).toBeGreaterThanOrEqual(0);
+    expect(snapshots[1]!.totalDelayMs).toBeGreaterThan(0);
+    expect(snapshots[2]!.setupMs).toBeGreaterThanOrEqual(snapshots[1]!.setupMs);
+    expect(snapshots[2]!.totalDelayMs).toBeGreaterThan(
+      snapshots[1]!.totalDelayMs,
+    );
+  });
+
+  it('first-try success: retryContext.attempt === 1, both delays === 0, onRetry never called', async () => {
+    let observed: { attempt: number; setup: number; delay: number } | null =
+      null;
+    const onRetry = vi.fn();
+    const fn = vi.fn(async () => {
+      const ctx = retryContext.getStore();
+      observed = {
+        attempt: ctx?.attempt ?? -1,
+        setup: ctx?.requestSetupMs ?? -1,
+        delay: ctx?.retryTotalDelayMs ?? -1,
+      };
+      return 'ok';
+    });
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 1,
+      maxDelayMs: 5,
+      onRetry,
+    });
+
+    await vi.runAllTimersAsync();
+    await promise;
+
+    expect(observed).toEqual({ attempt: 1, setup: 0, delay: 0 });
+    expect(onRetry).not.toHaveBeenCalled();
+  });
+
+  it('onRetry callback fires once per failed attempt with correct args', async () => {
+    const onRetry = vi.fn();
+    const fn = createFailingFunction(2, 'ok');
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 1,
+      maxDelayMs: 5,
+      onRetry,
+    });
+
+    await vi.runAllTimersAsync();
+    await promise;
+
+    // 2 failures -> 2 onRetry invocations
+    expect(onRetry).toHaveBeenCalledTimes(2);
+    const first = onRetry.mock.calls[0]![0] as RetryAttemptInfo;
+    expect(first.attempt).toBe(1);
+    expect(first.errorStatus).toBe(500);
+    expect((first.error as Error).message).toContain('attempt 1');
+    expect(first.delayMs).toBeGreaterThanOrEqual(0);
+    const second = onRetry.mock.calls[1]![0] as RetryAttemptInfo;
+    expect(second.attempt).toBe(2);
+  });
+
+  it('absence of onRetry is silent (no exception)', async () => {
+    const fn = createFailingFunction(1, 'ok');
+    // No onRetry passed. Must not throw or warn.
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 1,
+      maxDelayMs: 5,
+    });
+    await vi.runAllTimersAsync();
+    await expect(promise).resolves.toBe('ok');
+  });
+
+  it('onRetry callback throwing does NOT break the retry loop', async () => {
+    const onRetry = vi.fn(() => {
+      throw new Error('telemetry blew up');
+    });
+    const fn = createFailingFunction(2, 'ok');
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 1,
+      maxDelayMs: 5,
+      onRetry,
+    });
+
+    await vi.runAllTimersAsync();
+    await expect(promise).resolves.toBe('ok');
+    expect(onRetry).toHaveBeenCalledTimes(2);
+  });
+
+  it('shouldRetryOnError returns false mid-loop: onRetry not called for the giveup', async () => {
+    // Attempt 1 fails with 500 (retryable), attempt 2 fails with 400
+    // (non-retryable). Retry loop gives up on attempt 2 without invoking
+    // onRetry for it.
+    const onRetry = vi.fn();
+    let n = 0;
+    const fn = vi.fn(async () => {
+      n++;
+      const err: HttpError = new Error(`attempt ${n}`);
+      err.status = n === 1 ? 500 : 400;
+      throw err;
+    });
+
+    // Attach .catch() BEFORE the timer runs, so Vitest sees the promise has
+    // a handler when the rejection lands (avoids unhandled-rejection warnings).
+    const caught = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 1,
+      maxDelayMs: 5,
+      shouldRetryOnError: (e) =>
+        (e as HttpError).status === 500 || (e as HttpError).status === 429,
+      onRetry,
+    }).catch((e: unknown) => e);
+    await vi.runAllTimersAsync();
+    const error = await caught;
+    expect((error as Error).message).toBe('attempt 2');
+
+    // Only the FIRST failed attempt invoked onRetry (it led to a retry).
+    // The second failed attempt aborted the loop and did not.
+    expect(onRetry).toHaveBeenCalledTimes(1);
+    expect(onRetry.mock.calls[0]![0].attempt).toBe(1);
+  });
+
+  it('parallel retryWithBackoff calls maintain independent attempt counters', async () => {
+    // Two concurrent retryWithBackoff invocations must each see their own
+    // ALS context (AsyncLocalStorage isolates them by async chain).
+    const callA: number[] = [];
+    const callB: number[] = [];
+
+    const makeFn = (sink: number[]) => {
+      let n = 0;
+      return vi.fn(async () => {
+        n++;
+        sink.push(retryContext.getStore()?.attempt ?? -1);
+        if (n <= 1) {
+          const err: HttpError = new Error('boom');
+          err.status = 500;
+          throw err;
+        }
+        return 'ok';
+      });
+    };
+
+    const both = Promise.all([
+      retryWithBackoff(makeFn(callA), {
+        maxAttempts: 5,
+        initialDelayMs: 1,
+        maxDelayMs: 3,
+      }),
+      retryWithBackoff(makeFn(callB), {
+        maxAttempts: 5,
+        initialDelayMs: 1,
+        maxDelayMs: 3,
+      }),
+    ]);
+
+    await vi.runAllTimersAsync();
+    await both;
+
+    expect(callA).toEqual([1, 2]);
+    expect(callB).toEqual([1, 2]);
+  });
+
+  it('nested retryWithBackoff reads innermost frame', async () => {
+    const observed: Array<{
+      layer: 'outer' | 'inner';
+      attempt: number;
+    }> = [];
+    let innerAttempts = 0;
+
+    const inner = vi.fn(async () => {
+      innerAttempts++;
+      observed.push({
+        layer: 'inner',
+        attempt: retryContext.getStore()?.attempt ?? -1,
+      });
+      if (innerAttempts <= 1) {
+        const err: HttpError = new Error('inner-fail');
+        err.status = 500;
+        throw err;
+      }
+      return 'inner-ok';
+    });
+
+    const outer = vi.fn(async () => {
+      observed.push({
+        layer: 'outer',
+        attempt: retryContext.getStore()?.attempt ?? -1,
+      });
+      return await retryWithBackoff(inner, {
+        maxAttempts: 5,
+        initialDelayMs: 1,
+        maxDelayMs: 3,
+      });
+    });
+
+    const promise = retryWithBackoff(outer, {
+      maxAttempts: 1,
+      initialDelayMs: 1,
+      maxDelayMs: 3,
+    });
+
+    await vi.runAllTimersAsync();
+    await promise;
+
+    // Outer call sees its own frame's attempt (1).
+    // Inner calls see their own frame's attempt (1, then 2 after retry).
+    // Inner DOES NOT see the outer's frame.
+    expect(observed).toEqual([
+      { layer: 'outer', attempt: 1 },
+      { layer: 'inner', attempt: 1 },
+      { layer: 'inner', attempt: 2 },
+    ]);
+  });
+
+  it('persistent mode (status=429): onRetry fires with correct attempt + delayMs from persistent backoff', async () => {
+    // Review comment R1 #4 + R2 #3: the highest-volume production retry path
+    // (429 → persistent mode) was untested. Verify onRetry fires with the
+    // monotonic iterationCount and a reasonable backoff delay.
+    const onRetry = vi.fn();
+    let n = 0;
+    const fn = vi.fn(async () => {
+      n++;
+      if (n <= 2) {
+        const err: HttpError = new Error(`rate limited #${n}`);
+        err.status = 429;
+        throw err;
+      }
+      return 'ok';
+    });
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 50,
+      maxDelayMs: 200,
+      persistentMode: true,
+      onRetry,
+    });
+
+    await vi.runAllTimersAsync();
+    await expect(promise).resolves.toBe('ok');
+
+    expect(onRetry).toHaveBeenCalledTimes(2);
+    const first = onRetry.mock.calls[0]![0] as RetryAttemptInfo;
+    expect(first.attempt).toBe(1);
+    expect(first.errorStatus).toBe(429);
+    expect(first.delayMs).toBeGreaterThan(0);
+    const second = onRetry.mock.calls[1]![0] as RetryAttemptInfo;
+    expect(second.attempt).toBe(2);
+    expect(second.errorStatus).toBe(429);
+    // Persistent mode uses exponential backoff — second delay >= first
+    expect(second.delayMs).toBeGreaterThanOrEqual(first.delayMs);
+  });
+
+  it('normal retry with Retry-After header: onRetry receives the header-derived delayMs', async () => {
+    // Review comment R2 #7: verify that when the error includes a
+    // `retry-after` header, `onRetry.delayMs` reflects the parsed value
+    // (not the exponential backoff calculation).
+    const onRetry = vi.fn();
+    let n = 0;
+    const fn = vi.fn(async () => {
+      n++;
+      if (n <= 1) {
+        const err = new Error('rate limited') as HttpError & {
+          response?: { headers?: { 'retry-after'?: string } };
+        };
+        err.status = 429;
+        err.response = { headers: { 'retry-after': '2' } }; // 2 seconds
+        throw err;
+      }
+      return 'ok';
+    });
+
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 100,
+      maxDelayMs: 500,
+      onRetry,
+    });
+
+    await vi.runAllTimersAsync();
+    await expect(promise).resolves.toBe('ok');
+
+    expect(onRetry).toHaveBeenCalledTimes(1);
+    const info = onRetry.mock.calls[0]![0] as RetryAttemptInfo;
+    // Retry-After: 2 → 2000ms
+    expect(info.delayMs).toBe(2000);
+    expect(info.errorStatus).toBe(429);
+  });
+
+  it('signal.aborted before onRetry: no phantom retry event emitted', async () => {
+    // Review comment R2 #6: when signal fires between catch and onRetry,
+    // the guard `if (!signal?.aborted)` should prevent onRetry from firing.
+    const onRetry = vi.fn();
+    const controller = new AbortController();
+    let n = 0;
+    const fn = vi.fn(async () => {
+      n++;
+      if (n === 1) {
+        // Abort the signal during the first failure — before onRetry runs
+        controller.abort();
+        const err: HttpError = new Error('server error');
+        err.status = 500;
+        throw err;
+      }
+      return 'ok';
+    });
+
+    // The retry loop should detect the aborted signal and NOT fire onRetry.
+    const promise = retryWithBackoff(fn, {
+      maxAttempts: 5,
+      initialDelayMs: 10,
+      maxDelayMs: 50,
+      signal: controller.signal,
+      onRetry,
+    }).catch((e: unknown) => e);
+
+    await vi.runAllTimersAsync();
+    await promise;
+
+    // onRetry should NOT have been called because signal was aborted
+    expect(onRetry).not.toHaveBeenCalled();
+  });
+});
diff --git a/packages/core/src/utils/retry.ts b/packages/core/src/utils/retry.ts
index 54b0b6db9e0..12bed9e6826 100644
--- a/packages/core/src/utils/retry.ts
+++ b/packages/core/src/utils/retry.ts
@@ -9,6 +9,7 @@ import { AuthType } from '../core/contentGenerator.js';
 import { isQwenQuotaExceededError } from './quotaErrorDetection.js';
 import { createDebugLogger } from './debugLogger.js';
 import { getErrorStatus } from './errors.js';
+import { retryContext } from './retryContext.js';
 
 const debugLogger = createDebugLogger('RETRY');
 
@@ -27,6 +28,22 @@ export interface HeartbeatInfo {
   error: unknown;
 }
 
+/**
+ * Information passed to `RetryOptions.onRetry` after each failed attempt.
+ * Lets callers (LLM call sites) emit `ApiRetryEvent` telemetry without
+ * coupling `retry.ts` to telemetry concerns.
+ */
+export interface RetryAttemptInfo {
+  /**
+   * 1-based monotonic iteration counter — same value as ALS context's `attempt`.
+   */
+  attempt: number;
+  error: unknown;
+  errorStatus?: number;
+  /** Computed backoff delay that follows this failed attempt (ms). */
+  delayMs: number;
+}
+
 export interface RetryOptions {
   maxAttempts: number;
   initialDelayMs: number;
@@ -41,6 +58,25 @@ export interface RetryOptions {
   heartbeatIntervalMs?: number;
   heartbeatFn?: (info: HeartbeatInfo) => void;
   signal?: AbortSignal;
+  /**
+   * Optional. Called once per failed attempt after the backoff delay is
+   * computed but BEFORE the sleep. Use this to emit retry telemetry events
+   * (e.g. `ApiRetryEvent` for LLM call sites); leave undefined for non-LLM
+   * callers so they stay silent in LLM-specific telemetry channels.
+   *
+   * Contract:
+   * - Invoked only after `await fn()` rejects in the catch block of
+   *   `retryWithBackoff` (OUTSIDE the `retryContext.run()` ALS frame).
+   *   This is true for both synchronous and asynchronous throws from `fn`.
+   *   All retry-context data is passed via the `RetryAttemptInfo` parameter
+   *   — do NOT read `retryContext.getStore()` inside an `onRetry` callback.
+   * - Content-retries via `shouldRetryOnContent` do NOT fire `onRetry`.
+   *   If a future caller wires content retries, extend `retry.ts` to fire
+   *   `onRetry` on that path too.
+   * - Callback errors are swallowed and logged via `debugLogger.warn`; they
+   *   never affect retry behavior (best-effort telemetry).
+   */
+  onRetry?: (info: RetryAttemptInfo) => void;
 }
 
 const DEFAULT_RETRY_OPTIONS: RetryOptions = {
@@ -162,6 +198,7 @@ export async function retryWithBackoff<T>(
     heartbeatIntervalMs,
     heartbeatFn,
     signal,
+    onRetry,
   } = {
     ...DEFAULT_RETRY_OPTIONS,
     ...cleanOptions,
@@ -176,10 +213,24 @@ export async function retryWithBackoff<T>(
   let persistentAttempt = 0;
   let currentDelay = initialDelayMs;
 
+  // Phase 4b — retry telemetry context. `iterationCount` is the monotonic
+  // counter that always reflects "this is the Nth time fn was called",
+  // regardless of normal vs persistent retry mode. Decoupled from the
+  // `attempt` variable above which is clamped at `maxAttempts - 1` in
+  // persistent mode to keep the while-loop alive.
+  const requestEntryTime = Date.now();
+  let iterationCount = 0;
+  let retryTotalDelayMs = 0;
+
   while (attempt < maxAttempts) {
     attempt++;
+    iterationCount++;
+    const requestSetupMs = Date.now() - requestEntryTime;
     try {
-      const result = await fn();
+      const result = await retryContext.run(
+        { attempt: iterationCount, retryTotalDelayMs, requestSetupMs },
+        () => fn(),
+      );
 
       if (
         shouldRetryOnContent &&
@@ -188,6 +239,12 @@ export async function retryWithBackoff<T>(
         const jitter = currentDelay * 0.3 * (Math.random() * 2 - 1);
         const delayWithJitter = Math.max(0, currentDelay + jitter);
         await delay(delayWithJitter);
+        // Note: this inflates retryTotalDelayMs beyond what onRetry/ApiRetryEvent
+        // reports — content-retry delays are invisible in the api_retry telemetry
+        // channel (onRetry only fires from the catch-block error path). The LLM
+        // span's retry_total_delay_ms attribute includes ALL delays (content +
+        // error), which is the accurate "total time the user waited in backoff."
+        retryTotalDelayMs += delayWithJitter;
         currentDelay = Math.min(maxDelayMs, currentDelay * 2);
         continue;
       }
@@ -256,6 +313,27 @@ export async function retryWithBackoff<T>(
           error,
         );
 
+        // Phase 4b — fire onRetry telemetry callback BEFORE sleep, so
+        // operators see retry events live. Guard with signal?.aborted so we
+        // don't emit a phantom retry event for an attempt that will never
+        // actually proceed (signal fires during the previous sleep or between
+        // catch and this point). Wrap in try/catch: a logging failure must
+        // NEVER break the retry loop.
+        if (!signal?.aborted) {
+          try {
+            onRetry?.({
+              attempt: iterationCount,
+              error,
+              errorStatus,
+              delayMs,
+            });
+          } catch (cbError) {
+            debugLogger.warn(
+              `onRetry callback threw (swallowed): ${cbError instanceof Error ? cbError.message : String(cbError)}`,
+            );
+          }
+        }
+
         // Heartbeat sleep — chunked to keep CI alive
         await sleepWithHeartbeat(delayMs, {
           attempt: reportedAttempt,
@@ -264,30 +342,53 @@ export async function retryWithBackoff<T>(
           heartbeatFn,
           signal,
         });
+        retryTotalDelayMs += delayMs;
 
         // Clamp attempt so the while-loop never exits
         if (attempt >= maxAttempts) {
           attempt = maxAttempts - 1;
         }
       } else {
-        // Normal retry path (unchanged behavior)
+        // Normal retry path
         const retryAfterMs =
           errorStatus === 429 ? getRetryAfterDelayMs(error) : 0;
 
+        let actualDelayMs: number;
         if (retryAfterMs > 0) {
           debugLogger.warn(
             `Attempt ${attempt} failed with status ${errorStatus ?? 'unknown'}. Retrying after explicit delay of ${retryAfterMs}ms...`,
             error,
           );
-          await delay(retryAfterMs);
+          actualDelayMs = retryAfterMs;
           currentDelay = initialDelayMs;
         } else {
           logRetryAttempt(attempt, error, errorStatus);
           const jitter = currentDelay * 0.3 * (Math.random() * 2 - 1);
-          const delayWithJitter = Math.max(0, currentDelay + jitter);
-          await delay(delayWithJitter);
+          actualDelayMs = Math.max(0, currentDelay + jitter);
           currentDelay = Math.min(maxDelayMs, currentDelay * 2);
         }
+
+        // Phase 4b — fire onRetry telemetry callback BEFORE sleep. Guard
+        // with signal?.aborted to avoid phantom events when abort fires
+        // between catch and here. Wrapped in try/catch so a logging failure
+        // cannot break the retry loop.
+        if (!signal?.aborted) {
+          try {
+            onRetry?.({
+              attempt: iterationCount,
+              error,
+              errorStatus,
+              delayMs: actualDelayMs,
+            });
+          } catch (cbError) {
+            debugLogger.warn(
+              `onRetry callback threw (swallowed): ${cbError instanceof Error ? cbError.message : String(cbError)}`,
+            );
+          }
+        }
+
+        await delay(actualDelayMs);
+        retryTotalDelayMs += actualDelayMs;
       }
     }
   }
diff --git a/packages/core/src/utils/retryContext.ts b/packages/core/src/utils/retryContext.ts
new file mode 100644
index 00000000000..fac90c8758a
--- /dev/null
+++ b/packages/core/src/utils/retryContext.ts
@@ -0,0 +1,40 @@
+/**
+ * @license
+ * Copyright 2026 Qwen Team
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { AsyncLocalStorage } from 'node:async_hooks';
+
+/**
+ * Per-attempt retry context propagated through AsyncLocalStorage from
+ * `retryWithBackoff` down to `LoggingContentGenerator`. Lets each per-attempt
+ * `qwen-code.llm_request` span carry meaningful `attempt` / `request_setup_ms`
+ * / `retry_total_delay_ms` attributes without changing the LLM API surface.
+ *
+ * See docs/design/telemetry-llm-request-timing-design.md (Phase 4b, D4).
+ */
+export interface RetryAttemptContext {
+  /**
+   * 1-based monotonic iteration counter for the current `retryWithBackoff`
+   * execution. Always reflects "this is the Nth time fn was called",
+   * regardless of normal vs persistent retry mode. Unaffected by the
+   * `attempt = maxAttempts - 1` clamping that keeps the persistent-mode loop
+   * alive.
+   */
+  readonly attempt: number;
+  /**
+   * Sum of all backoff delays BEFORE this attempt started (ms). 0 for attempt 1.
+   * Accumulates across the retry loop.
+   */
+  readonly retryTotalDelayMs: number;
+  /**
+   * Time from `retryWithBackoff` entry to THIS attempt's start (ms). 0 for
+   * attempt 1 of a no-retry success. For attempt N>1, equals cumulative time
+   * spent in attempts 1..N-1 plus their backoff sleeps. Computed in `retry.ts`
+   * immediately before `await fn()` to avoid measurement drift across layers.
+   */
+  readonly requestSetupMs: number;
+}
+
+export const retryContext = new AsyncLocalStorage<RetryAttemptContext>();