Commit a81f9ee

feat: Add AI response timeout configuration and SSE stream error handling
1 parent eba9e2d commit a81f9ee

7 files changed: 158 additions & 40 deletions


docs/operations_configuration.md

Lines changed: 33 additions & 0 deletions
````diff
@@ -235,6 +235,7 @@ Choose **one** method (mutually exclusive):
 | [`TOKENS_ESTIMATION_DEFAULT_ENCODING`](#tokens-encoding) | `o200k_base` | Tiktoken encoding algorithm: `o200k_base` (GPT-4o+), `cl100k_base` (GPT-4), or `p50k_base` |
 | [`DEFAULT_MODEL_PARAMS`](#default-model-params) | `{}` | JSON object with per-model default inference parameters (temperature, max_tokens, etc.) |
 | [`MODEL_CACHE_SECONDS`](#model-cache-seconds) | `900` | Model list cache lifetime in seconds before lazy refresh (default: 15 minutes) |
+| [`AI_RESPONSE_TIMEOUT`](#ai-response-timeout) | `600` | Maximum seconds to wait for a model to complete a response (default: 10 minutes) |
 | [`DROP_UNSUPPORTED_SYSTEM_PROMPT`](#drop-unsupported-system-prompt) | `true` | Drop system prompts for unsupported models; when `false`, return error instead |
 | [`ANTHROPIC_BETA_FILTER`](#anthropic-beta-filter) | `true` | Enable filtering of unsupported `anthropic_beta` flags for Claude models |
 | [`ANTHROPIC_BETA_ALLOWLIST`](#anthropic-beta-allowlist) | `(empty)` | Additional `anthropic_beta` flags to allow beyond built-in Bedrock defaults |
@@ -2895,6 +2896,38 @@ export MODEL_CACHE_SECONDS=3600
 - **Rate Limits**: Very frequent refreshes in high-traffic deployments may approach API rate limits, though parallel execution doesn't increase per-region request rate
 - **Multi-Region**: Refresh latency is determined by the slowest responding region, not the total number of regions, thanks to parallel execution
 
+#### `AI_RESPONSE_TIMEOUT` { #ai-response-timeout }
+
+:octicons-package-24: **Purpose**
+: Maximum time in seconds to wait for an AI model to complete a response
+
+:octicons-database-24: **Type**
+: Integer (seconds, must be greater than 0)
+
+:octicons-gear-24: **Default**
+: `600` (10 minutes)
+
+:octicons-workflow-24: **Behavior**
+: Applies to both streaming and non-streaming requests. The timer starts from the moment the model begins generating and covers the full duration until the last token is received. If the model does not complete within this limit, the connection is closed and the request fails with a timeout error
+
+```bash
+# Default (10 minutes) - suitable for extended thinking models
+export AI_RESPONSE_TIMEOUT=600
+
+# Shorter timeout for standard models (2 minutes)
+export AI_RESPONSE_TIMEOUT=120
+
+# Longer timeout for very long documents or high reasoning budgets (15 minutes)
+export AI_RESPONSE_TIMEOUT=900
+```
+
+!!! tip "When to Adjust"
+    - **Increase** if you see timeout errors with models that use extended thinking/reasoning, large document analysis, or high token budgets
+    - **Decrease** to fail fast and free resources if your workload only uses standard models where long waits indicate a problem
+
+!!! info "Extended Thinking Models"
+    Models with extended reasoning capabilities (such as Claude with `thinking` enabled or high `reasoning_effort`) may spend significant time generating internal reasoning steps before producing output. The default of 600 seconds accommodates these use cases. Standard models without extended thinking typically respond within 60 seconds.
+
 ---
 
 ## Default Model Parameters
````

stdapi/aws.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@
     retries=_RETRIES,
     max_pool_connections=_MAX_POOL_CONNECTIONS,
     parameter_validation=False,
+    read_timeout=SETTINGS.ai_response_timeout,
 )
 
 getLogger("aiobotocore").setLevel("CRITICAL")
```
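With `read_timeout` wired through, a stalled Bedrock read now fails instead of hanging indefinitely. The semantics can be sketched with stdlib asyncio (a toy stand-in; botocore's actual `read_timeout` applies at the socket-read level):

```python
import asyncio


async def slow_model(delay: float) -> str:
    # Stand-in for a model that takes `delay` seconds to finish responding.
    await asyncio.sleep(delay)
    return "done"


async def main() -> str:
    # A response slower than the limit raises a timeout, analogous to how
    # botocore's read_timeout aborts a stalled read of the Bedrock response.
    try:
        return await asyncio.wait_for(slow_model(0.2), timeout=0.05)
    except asyncio.TimeoutError:
        return "timeout"


result = asyncio.run(main())
print(result)  # timeout
```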

stdapi/aws_bedrock.py

Lines changed: 35 additions & 0 deletions
```diff
@@ -366,6 +366,41 @@ def get_extra_model_parameters(
     return params
 
 
+#: AWS error codes to HTTP status + error type mapping
+AWS_ERROR_MAP: dict[str, tuple[int, str]] = {
+    **dict.fromkeys(
+        {
+            "ThrottlingException",
+            "TooManyRequestsException",
+            "ServiceQuotaExceededException",
+        },
+        (429, "rate_limit_error"),
+    ),
+    **dict.fromkeys({"AccessDeniedException"}, (403, "permission_error")),
+    **dict.fromkeys(
+        {
+            "UnrecognizedClientException",
+            "InvalidSignatureException",
+            "ExpiredTokenException",
+        },
+        (401, "authentication_error"),
+    ),
+    **dict.fromkeys({"ResourceNotFoundException"}, (404, "not_found_error")),
+    **dict.fromkeys(
+        {"ValidationException", "BadRequestException"}, (400, "invalid_request_error")
+    ),
+    **dict.fromkeys(
+        {
+            "ServiceUnavailableException",
+            "InternalServerException",
+            "ServiceFailureException",
+            "ReadTimeoutError",
+        },
+        (503, "server_error"),
+    ),
+}
+
+
 @contextmanager
 def handle_bedrock_client_error() -> Generator[None]:
     """Context manager to translate Bedrock client errors to appropriate HTTP 4XX/5XX when possible.
```
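The `dict.fromkeys` fan-in used by `AWS_ERROR_MAP` collapses several AWS error codes onto one `(status, error_type)` pair; a trimmed-down sketch of the pattern (subset of codes, hypothetical `ERROR_MAP` name):

```python
# Several AWS error codes map to the same HTTP status and error type.
ERROR_MAP: dict[str, tuple[int, str]] = {
    **dict.fromkeys(
        {"ThrottlingException", "TooManyRequestsException"},
        (429, "rate_limit_error"),
    ),
    **dict.fromkeys({"AccessDeniedException"}, (403, "permission_error")),
}

# Known codes resolve to their mapped pair.
status, err_type = ERROR_MAP.get("ThrottlingException", (502, "server_error"))
print(status, err_type)  # 429 rate_limit_error

# Unknown codes fall back to a 502 server_error, as in the handler.
unknown = ERROR_MAP.get("SomethingElse", (502, "server_error"))
print(unknown)  # (502, 'server_error')
```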

stdapi/config.py

Lines changed: 18 additions & 0 deletions
```diff
@@ -671,6 +671,24 @@ class _Settings(BaseSettings):
         ),
     )
 
+    ai_response_timeout: int = Field(
+        default=600,
+        gt=0,
+        description=(
+            "Maximum time in seconds to wait for an AI model to complete a response. "
+            "This applies to both streaming and non-streaming requests, from the moment "
+            "the model starts generating until the last token is received.\n\n"
+            "The default of 600 seconds (10 minutes) accommodates models with extended "
+            "reasoning or thinking capabilities, which may take longer to generate "
+            "complex responses. For standard models without extended thinking, responses "
+            "typically complete well within 60 seconds.\n\n"
+            "Increase this value if you experience timeout errors with long-running "
+            "requests (e.g., large document analysis, complex reasoning tasks). "
+            "Decrease it to fail fast on unexpectedly slow responses.\n\n"
+            "Example: 300 (5 minutes), 600 (10 minutes, default), 900 (15 minutes)"
+        ),
+    )
+
     model_cache_seconds: int = Field(
         default=900,
         description=(
```
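The `gt=0` constraint can be exercised with a plain pydantic model (a sketch assuming pydantic is installed; the project's real class is `_Settings(BaseSettings)`):

```python
from pydantic import BaseModel, Field, ValidationError


class Settings(BaseModel):
    # Same constraint as the real field: strictly positive seconds.
    ai_response_timeout: int = Field(default=600, gt=0)


# Omitting the field uses the default.
defaulted = Settings().ai_response_timeout
print(defaulted)  # 600

# Zero or negative values are rejected at construction time.
try:
    Settings(ai_response_timeout=0)
    rejected = False
except ValidationError:
    rejected = True
print(rejected)  # True
```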

stdapi/main.py

Lines changed: 2 additions & 35 deletions
```diff
@@ -20,6 +20,7 @@
 from stdapi.auth import initialize_authentication
 from stdapi.aws import AWSConnectionManager, initialize_aws_account_info
 from stdapi.aws_bedrock import (
+    AWS_ERROR_MAP,
     set_guardrail_configuration,
     set_performance_configuration,
 )
@@ -311,40 +312,6 @@ async def handle_validation_exception(
     )
 
 
-#: AWS error codes to OpenAI error codes
-_AWS_ERROR_MAP: dict[str, tuple[int, str]] = {
-    **dict.fromkeys(
-        {
-            "ThrottlingException",
-            "TooManyRequestsException",
-            "ServiceQuotaExceededException",
-        },
-        (429, "rate_limit_error"),
-    ),
-    **dict.fromkeys({"AccessDeniedException"}, (403, "permission_error")),
-    **dict.fromkeys(
-        {
-            "UnrecognizedClientException",
-            "InvalidSignatureException",
-            "ExpiredTokenException",
-        },
-        (401, "authentication_error"),
-    ),
-    **dict.fromkeys({"ResourceNotFoundException"}, (404, "not_found_error")),
-    **dict.fromkeys(
-        {"ValidationException", "BadRequestException"}, (400, "invalid_request_error")
-    ),
-    **dict.fromkeys(
-        {
-            "ServiceUnavailableException",
-            "InternalServerException",
-            "ServiceFailureException",
-        },
-        (503, "server_error"),
-    ),
-}
-
-
 @app.exception_handler(ClientError)
 async def handle_botocore_client_error(
     request: Request, exc: ClientError
@@ -362,7 +329,7 @@ async def handle_botocore_client_error(
     """
     error = exc.response["Error"]
     aws_code = error["Code"]
-    status, err_type = _AWS_ERROR_MAP.get(aws_code, (502, "server_error"))
+    status, err_type = AWS_ERROR_MAP.get(aws_code, (502, "server_error"))
     log_error_details(error["Message"], status=status)
     return JSONResponse(
         *format_http_error(
```

stdapi/models/chat/_default.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -25,7 +25,7 @@
 from stdapi.models.chat._adapters import _openai_chat_completion as openai_adapter
 from stdapi.monitoring import (
     REQUEST_HEADERS,
-    log_request_stream_event,
+    log_request_sse_stream_event,
     log_response_params,
 )
 from stdapi.types.anthropic_messages import ToolChoiceToolParam
@@ -158,7 +158,7 @@ async def create_completion(
     )
     if request.stream:
         return EventSourceResponse(
-            await log_request_stream_event(
+            log_request_sse_stream_event(
                 openai_adapter.format_stream(
                     completion_id,
                     created,
@@ -267,7 +267,7 @@ async def create_message(
         await bedrock_runtime.converse_stream(**bedrock_request)
     )["stream"]
     return EventSourceResponse(
-        await log_request_stream_event(
+        log_request_sse_stream_event(
             anthropic_adapter.format_stream(
                 message_id, request.model, bedrock_stream, forced_tool
             )
```
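The dropped `await` follows from the new function's shape: `log_request_stream_event` is a coroutine that returns an async generator, while `log_request_sse_stream_event` is itself an async generator. A toy illustration of the calling difference (hypothetical functions, not the project's):

```python
import asyncio
from collections.abc import AsyncGenerator


async def returns_stream():
    # Old shape: a coroutine that *returns* an async generator,
    # so callers must write `await returns_stream()`.
    async def _gen() -> AsyncGenerator[int, None]:
        yield 1
        yield 2

    return _gen()


async def is_stream() -> AsyncGenerator[int, None]:
    # New shape: an async generator function; calling it already
    # produces an async iterable, so no `await` is needed.
    yield 1
    yield 2


async def main() -> tuple[list[int], list[int]]:
    old = [x async for x in await returns_stream()]  # await required
    new = [x async for x in is_stream()]  # no await
    return old, new


old, new = asyncio.run(main())
print(old, new)  # [1, 2] [1, 2]
```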

stdapi/monitoring.py

Lines changed: 66 additions & 2 deletions
```diff
@@ -6,17 +6,22 @@
 from traceback import format_exception
 from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypedDict, TypeVar
 
+from botocore.exceptions import ClientError
+from fastapi import Request  # noqa: TC002
 from pydantic import AwareDatetime, BaseModel, JsonValue
+from sse_starlette import JSONServerSentEvent
 
+from stdapi.api_errors import ApiError
+from stdapi.api_providers import format_http_error
+from stdapi.aws_bedrock import AWS_ERROR_MAP
 from stdapi.config import SETTINGS, LogLevel
 from stdapi.metering import SERVER_FULL_VERSION
 from stdapi.server import SERVER_NAME
-from stdapi.utils import stdout_write, webuuid
+from stdapi.utils import hide_security_details, stdout_write, webuuid
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator, Generator
 
-    from fastapi import Request
     from pydantic.main import IncEx
     from starlette.datastructures import Headers
     from types_aiobotocore_meteringmarketplace.type_defs import (
@@ -99,6 +104,9 @@ class EventLog(TypedDict):
 #: Request HTTP headers
 REQUEST_HEADERS: ContextVar[Headers] = ContextVar("request_headers")
 
+#: HTTP request object
+REQUEST: ContextVar[Request] = ContextVar("request")
+
 #: Paths to ignore in logging
 LOGGING_PATHS_IGNORE = {
     "/",
@@ -162,6 +170,7 @@ def log_request_event(request: Request) -> Generator[EventLog]:
     REQUEST_ID.set(request_id)
     request_time = SETTINGS.now()
     REQUEST_TIME.set(request_time)
+    REQUEST.set(request)
     log = EventLog(
         type="request",
         level="info",
@@ -408,3 +417,58 @@ async def log_request_stream_event[T](stream: AsyncGenerator[T]) -> AsyncGenerat
         Items from the input asynchronous generator in their modified or original form.
     """
     return _rebuild_and_log_stream(await stream.__anext__(), stream)
+
+
+async def log_request_sse_stream_event(
+    stream: AsyncGenerator[JSONServerSentEvent],
+) -> AsyncGenerator[JSONServerSentEvent]:
+    """Log, monitor, and error-guard an SSE stream for use with ``EventSourceResponse``.
+
+    Combines :func:`log_request_stream_event` and an SSE error boundary into a
+    single step. After the HTTP response headers are sent, any exception that
+    escapes the underlying generator cannot be turned into an HTTP error response
+    (Starlette raises ``RuntimeError: Caught handled exception, but response
+    already started``). This wrapper catches such exceptions, logs them via
+    :func:`log_error_details`, and yields a terminal ``error`` SSE event
+    formatted for the matched API provider so that ``EventSourceResponse`` can
+    close the connection cleanly.
+
+    Args:
+        stream: Raw SSE async generator (e.g. from an adapter's ``format_stream``).
+
+    Yields:
+        Items from ``stream`` (after monitoring setup), followed by a provider-
+        formatted ``error`` SSE event on failure.
+    """
+    try:
+        async for chunk in _rebuild_and_log_stream(await stream.__anext__(), stream):
+            yield chunk
+    except ApiError as exc:
+        status = exc.status
+        log_error_details(exc.args[0], status=status)
+        yield JSONServerSentEvent(
+            data=format_http_error(
+                REQUEST.get(),
+                status,
+                hide_security_details(status, exc.args[0]),
+                exc.param,
+                exc.code,
+            )[0],
+            event="error",
+        )
+    except ClientError as exc:
+        error = exc.response["Error"]
+        status = AWS_ERROR_MAP.get(error["Code"], (502, "server_error"))[0]
+        log_error_details(error["Message"], status=status)
+        yield JSONServerSentEvent(
+            data=format_http_error(
+                REQUEST.get(), status, hide_security_details(status, error["Message"])
+            )[0],
+            event="error",
+        )
+    except Exception as exc:  # noqa: BLE001
+        log_error_details("\n".join(format_exception(exc)), level="critical")
+        yield JSONServerSentEvent(
+            data=format_http_error(REQUEST.get(), 500, "Internal Server Error")[0],
+            event="error",
+        )
```
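The error-boundary pattern in `log_request_sse_stream_event` generalizes: wrap the stream and convert any escaping exception into a terminal error event, since no HTTP error response is possible once headers have been sent. A simplified sketch with plain dicts in place of SSE events (the real function also picks provider-specific formatting and status codes):

```python
import asyncio
from collections.abc import AsyncGenerator


async def error_boundary(
    stream: AsyncGenerator[dict, None],
) -> AsyncGenerator[dict, None]:
    # Pass chunks through; if the stream blows up mid-response, emit a
    # terminal error event instead of letting the exception escape after
    # the response headers have already been sent.
    try:
        async for chunk in stream:
            yield chunk
    except Exception as exc:
        yield {"event": "error", "data": str(exc)}


async def failing_stream() -> AsyncGenerator[dict, None]:
    yield {"event": "message", "data": "first chunk"}
    raise RuntimeError("model backend dropped the connection")


async def main() -> list[dict]:
    return [chunk async for chunk in error_boundary(failing_stream())]


events = asyncio.run(main())
print(events[-1]["event"])  # error
```

The client still receives every chunk produced before the failure, followed by one well-formed `error` event, so the connection closes cleanly instead of being cut mid-stream.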
