-
Notifications
You must be signed in to change notification settings - Fork 410
Harden awf-reflect startup against api-proxy warmup 503s on /v1/models #34265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ee6d784
1776b0b
aee30d8
bf57424
225cf6d
3f2ae34
7a1c875
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,8 +15,11 @@ | |
|
|
||
| "use strict"; | ||
|
|
||
| require("./shim.cjs"); | ||
|
|
||
| const fs = require("fs"); | ||
| const path = require("path"); | ||
| const { withRetry } = require("./error_recovery.cjs"); | ||
|
|
||
| // AWF API proxy management endpoint for discovering configured LLM providers and available models. | ||
| // The api-proxy sidecar exposes /reflect on its management port (port 10000) inside the AWF | ||
|
|
@@ -29,6 +32,12 @@ const AWF_REFLECT_OUTPUT_PATH = "/tmp/gh-aw/sandbox/firewall/awf-reflect.json"; | |
| const AWF_REFLECT_TIMEOUT_MS = 60000; | ||
| // Milliseconds to wait for each models_url fallback fetch (shorter than the main reflect timeout). | ||
| const AWF_MODELS_URL_TIMEOUT_MS = 3000; | ||
| // Maximum attempts for models_url fallback fetches when the proxy is not yet ready. | ||
| const AWF_MODELS_URL_MAX_ATTEMPTS = 5; | ||
| // Base delay between models_url fallback retries. Uses exponential backoff. | ||
| const AWF_MODELS_URL_RETRY_BASE_MS = 250; | ||
| // Cap for exponential backoff delay between retries. | ||
| const AWF_MODELS_URL_RETRY_MAX_MS = 2000; | ||
| // Gemini model name prefix stripped from model IDs in the Gemini models API response. | ||
| // Example: { name: "models/gemini-1.5-pro" } → "gemini-1.5-pro" | ||
| const GEMINI_MODEL_NAME_PREFIX = "models/"; | ||
|
|
@@ -84,32 +93,79 @@ function extractModelIds(json) { | |
| * @returns {Promise<string[]|null>} | ||
| */ | ||
| async function fetchModelsFromUrl(modelsUrl, timeoutMs, logger) { | ||
| const ac = new AbortController(); | ||
| const timer = setTimeout(() => { | ||
| logger(`awf-reflect: models fetch timed out for ${modelsUrl}`); | ||
| ac.abort(); | ||
| }, timeoutMs); | ||
| let attemptCounter = 0; | ||
| const retryConfig = { | ||
| maxRetries: AWF_MODELS_URL_MAX_ATTEMPTS - 1, | ||
| // withRetry multiplies delay before the next attempt, so divide by 2 here | ||
| // to preserve the intended first backoff of AWF_MODELS_URL_RETRY_BASE_MS. | ||
| initialDelayMs: Math.ceil(AWF_MODELS_URL_RETRY_BASE_MS / 2), | ||
| maxDelayMs: AWF_MODELS_URL_RETRY_MAX_MS, | ||
| backoffMultiplier: 2, | ||
| jitterMs: 0, | ||
| shouldRetry: error => { | ||
| const original = error?.originalError || error; | ||
| const status = original?.status ?? original?.response?.status ?? null; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/diagnose] The 💡 SuggestionEither remove the guard (the log fires on every 503, including the final one), or add a clarifying comment: // Log on every 503; withRetry itself stops after maxRetries (MAX_ATTEMPTS - 1 retries).
if (shouldRetry) {
logger(`awf-reflect: models fetch returned 503 for ${modelsUrl}; retrying (attempt ${attemptCounter + 1}/${AWF_MODELS_URL_MAX_ATTEMPTS})`);
}This also means the last 503 won't log a "retrying" line even though |
||
| const shouldRetry = status === 503; | ||
| if (shouldRetry && attemptCounter < AWF_MODELS_URL_MAX_ATTEMPTS) { | ||
| logger(`awf-reflect: models fetch returned 503 for ${modelsUrl}; retrying (attempt ${attemptCounter + 1}/${AWF_MODELS_URL_MAX_ATTEMPTS})`); | ||
| } | ||
| return shouldRetry; | ||
| }, | ||
| }; | ||
|
|
||
| try { | ||
| const res = await fetch(modelsUrl, { signal: ac.signal }); | ||
| if (!res.ok) { | ||
| logger(`awf-reflect: models fetch returned ${res.status} for ${modelsUrl}`); | ||
| return null; | ||
| } | ||
| const json = await res.json(); | ||
| const models = extractModelIds(json); | ||
| if (models) { | ||
| logger(`awf-reflect: fetched ${models.length} model(s) from ${modelsUrl}`); | ||
| } | ||
| return models; | ||
| return await withRetry( | ||
| async () => { | ||
| attemptCounter += 1; | ||
| const ac = new AbortController(); | ||
| const timer = setTimeout(() => { | ||
| logger(`awf-reflect: models fetch timed out for ${modelsUrl}`); | ||
| ac.abort(); | ||
| }, timeoutMs); | ||
| try { | ||
| const res = await fetch(modelsUrl, { signal: ac.signal }); | ||
| if (!res.ok) { | ||
| if (res.status === 503) { | ||
| const err = Object.assign(new Error(`models fetch returned 503 for ${modelsUrl}`), { status: 503 }); | ||
| throw err; | ||
| } | ||
| logger(`awf-reflect: models fetch returned ${res.status} for ${modelsUrl}`); | ||
| return null; | ||
| } | ||
| const json = await res.json(); | ||
| const models = extractModelIds(json); | ||
| if (models) { | ||
| logger(`awf-reflect: fetched ${models.length} model(s) from ${modelsUrl}`); | ||
| } | ||
| return models; | ||
| } catch (err) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/diagnose] A timeout ( 💡 ContextIn the original code, Now, if attempt #2 times out but the server would have responded on attempt #3, the retry loop stops early. This is probably acceptable (keep pre-flight opportunistic), but the behavior difference is non-obvious and a comment would prevent a future reader from "fixing" it. // Timeout: treat as a successful null (not an error) so withRetry
// doesn't count this as a retryable failure — pre-flight capture is best-effort.
if (e.name === "AbortError") {
return null;
} |
||
| const e = /** @type {Error} */ err; | ||
| if (e.name === "AbortError") { | ||
| return null; // already logged above | ||
| } | ||
| const status = e?.status ?? e?.response?.status ?? null; | ||
| if (status === 503) { | ||
| throw e; | ||
| } | ||
| logger(`awf-reflect: models fetch error for ${modelsUrl}: ${e.message}`); | ||
| return null; | ||
| } finally { | ||
| clearTimeout(timer); | ||
| } | ||
| }, | ||
| retryConfig, | ||
| `awf-reflect models fetch for ${modelsUrl}` | ||
| ); | ||
| } catch (err) { | ||
| const e = /** @type {Error} */ err; | ||
| if (e.name === "AbortError") { | ||
| return null; // already logged above | ||
| const original = e?.originalError || e; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/tdd] This outer 💡 SuggestionAdd a test that verifies what
Dead catch branches are maintenance traps: they'll never fail tests if the behavior changes, but they'll mislead readers into thinking there's a safety net. |
||
| const status = original?.status ?? original?.response?.status ?? null; | ||
| if (status === 503) { | ||
| logger(`awf-reflect: models fetch returned 503 for ${modelsUrl}`); | ||
| return null; | ||
| } | ||
| logger(`awf-reflect: models fetch error for ${modelsUrl}: ${e.message}`); | ||
| return null; | ||
| } finally { | ||
| clearTimeout(timer); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -250,6 +306,9 @@ if (typeof module !== "undefined" && module.exports) { | |
| AWF_REFLECT_OUTPUT_PATH, | ||
| AWF_REFLECT_TIMEOUT_MS, | ||
| AWF_MODELS_URL_TIMEOUT_MS, | ||
| AWF_MODELS_URL_MAX_ATTEMPTS, | ||
| AWF_MODELS_URL_RETRY_BASE_MS, | ||
| AWF_MODELS_URL_RETRY_MAX_MS, | ||
| GEMINI_MODEL_NAME_PREFIX, | ||
| enrichReflectModels, | ||
| extractModelIds, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,9 @@ const { | |
| AWF_REFLECT_OUTPUT_PATH, | ||
| AWF_REFLECT_TIMEOUT_MS, | ||
| AWF_MODELS_URL_TIMEOUT_MS, | ||
| AWF_MODELS_URL_MAX_ATTEMPTS, | ||
| AWF_MODELS_URL_RETRY_BASE_MS, | ||
| AWF_MODELS_URL_RETRY_MAX_MS, | ||
| GEMINI_MODEL_NAME_PREFIX, | ||
| enrichReflectModels, | ||
| extractModelIds, | ||
|
|
@@ -24,6 +27,9 @@ describe("awf_reflect.cjs", () => { | |
| expect(AWF_REFLECT_OUTPUT_PATH).toBe("/tmp/gh-aw/sandbox/firewall/awf-reflect.json"); | ||
| expect(AWF_REFLECT_TIMEOUT_MS).toBe(60000); | ||
| expect(AWF_MODELS_URL_TIMEOUT_MS).toBe(3000); | ||
| expect(AWF_MODELS_URL_MAX_ATTEMPTS).toBe(5); | ||
| expect(AWF_MODELS_URL_RETRY_BASE_MS).toBe(250); | ||
| expect(AWF_MODELS_URL_RETRY_MAX_MS).toBe(2000); | ||
| expect(GEMINI_MODEL_NAME_PREFIX).toBe("models/"); | ||
| }); | ||
| }); | ||
|
|
@@ -171,6 +177,33 @@ describe("awf_reflect.cjs", () => { | |
| expect(result).toBeNull(); | ||
| expect(logs.some(l => l.includes("models fetch error"))).toBe(true); | ||
| }); | ||
|
|
||
| it("retries on 503 and eventually succeeds", async () => { | ||
| vi.stubGlobal( | ||
| "fetch", | ||
| vi | ||
| .fn() | ||
| .mockResolvedValueOnce({ ok: false, status: 503 }) | ||
| .mockResolvedValueOnce({ ok: false, status: 503 }) | ||
| .mockResolvedValueOnce({ ok: true, status: 200, json: async () => ({ data: [{ id: "gpt-4o" }] }) }) | ||
| ); | ||
|
|
||
| const logs = []; | ||
| const result = await fetchModelsFromUrl("http://api-proxy:10000/v1/models", 1000, msg => logs.push(msg)); | ||
| expect(result).toEqual(["gpt-4o"]); | ||
| expect(logs.filter(l => l.includes("retrying (attempt")).length).toBe(2); | ||
| expect(logs.some(l => l.includes("fetched 1 model(s)"))).toBe(true); | ||
| }); | ||
|
|
||
| it("stops retrying after max attempts on repeated 503 responses", async () => { | ||
| vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: false, status: 503 })); | ||
|
|
||
| const logs = []; | ||
| const result = await fetchModelsFromUrl("http://api-proxy:10000/v1/models", 1000, msg => logs.push(msg)); | ||
| expect(result).toBeNull(); | ||
| expect(logs.filter(l => l.includes("retrying (attempt")).length).toBe(AWF_MODELS_URL_MAX_ATTEMPTS - 1); | ||
| expect(logs.some(l => l.includes("models fetch returned 503"))).toBe(true); | ||
| }); | ||
|
Comment on lines
+181
to
+206
|
||
| }); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/tdd] The exhaustion test verifies log messages and null return but doesn't assert that 💡 Suggested additionconst mockFetch = vi.fn().mockResolvedValue({ ok: false, status: 503 });
vi.stubGlobal("fetch", mockFetch);
// ... run ...
expect(mockFetch).toHaveBeenCalledTimes(AWF_MODELS_URL_MAX_ATTEMPTS);This pins the contract: exactly 5 attempts (1 initial + 4 retries), no more, no less. Without it, an off-by-one in |
||
|
|
||
| describe("fetchAWFReflect", () => { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[/diagnose]
initialDelayMs / 2leakswithRetry's internal multiply-before-delay contract into the caller — ifwithRetrychanges when it applies the multiplier, the effective first delay will silently double or halve.💡 Suggestion
Consider one of:
error_recovery.test.cjsthat pinswithRetry's first-delay behavior so a change there fails loudly before it reaches callers.AWF_MODELS_URL_RETRY_BASE_MSand documentwithRetry's contract in its own JSDoc instead of compensating at call-site.Right now any refactor of
withRetry's backoff scheduling breaks this silently.