diff --git a/litellm/litellm_core_utils/core_helpers.py b/litellm/litellm_core_utils/core_helpers.py index e984df82140..98b792efa59 100644 --- a/litellm/litellm_core_utils/core_helpers.py +++ b/litellm/litellm_core_utils/core_helpers.py @@ -242,9 +242,28 @@ def _get_parent_otel_span_from_kwargs( return None -def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict: +def process_response_headers( + response_headers: Union[httpx.Headers, dict], + preserve_litellm_internal_headers: bool = False, +) -> dict: + """ + `preserve_litellm_internal_headers` must only be True when the input is a + LiteLLM-owned dict (e.g. `_hidden_params["additional_headers"]` that has + already been through one round of processing). For raw upstream provider + headers — whether passed as `httpx.Headers` or a plain dict — it must + remain False, otherwise a malicious provider returning `x-litellm-*` could + spoof LiteLLM-internal markers (e.g. `x-litellm-attempted-fallbacks`). + + When the input is an `httpx.Headers` object the flag is always treated as + False regardless of what the caller requested, because `httpx.Headers` is + always a raw provider response and can never be LiteLLM-owned. + """ from litellm.types.utils import OPENAI_RESPONSE_HEADERS + # Raw httpx.Headers objects come directly from provider HTTP responses and + # must never be treated as LiteLLM-owned, regardless of caller intent. + _preserve = preserve_litellm_internal_headers and isinstance(response_headers, dict) + openai_headers = {} processed_headers = {} additional_headers = {} @@ -256,6 +275,12 @@ def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> di "llm_provider-" ): # return raw provider headers (incl. openai-compatible ones) processed_headers[k] = v + elif _preserve and k.startswith("x-litellm-"): + # LiteLLM's own internal headers (e.g. x-litellm-attempted-fallbacks, + # x-litellm-model-group) are not LLM provider headers and must not be + # prefixed. Downstream consumers (proxy override, callers checking + # whether a fallback happened) look up the bare key. + processed_headers[k] = v else: additional_headers["{}-{}".format("llm_provider", k)] = v diff --git a/litellm/litellm_core_utils/fallback_utils.py b/litellm/litellm_core_utils/fallback_utils.py index daacca85c8a..1606b53e1f9 100644 --- a/litellm/litellm_core_utils/fallback_utils.py +++ b/litellm/litellm_core_utils/fallback_utils.py @@ -7,6 +7,9 @@ safe_deep_copy, filter_internal_params, ) +from litellm.router_utils.add_retry_fallback_headers import ( + add_fallback_headers_to_response, +) from .asyncify import run_async_function @@ -42,7 +45,7 @@ async def async_completion_with_fallbacks(**kwargs): # Try each fallback model most_recent_exception_str: Optional[str] = None - for fallback in fallbacks: + for attempted_fallbacks, fallback in enumerate(fallbacks): try: completion_kwargs = safe_deep_copy(base_kwargs) # Handle dictionary fallback configurations @@ -63,7 +66,10 @@ async def async_completion_with_fallbacks(**kwargs): ) if response is not None: - return response + return add_fallback_headers_to_response( + response=response, + attempted_fallbacks=attempted_fallbacks, + ) except Exception as e: verbose_logger.exception( diff --git a/litellm/litellm_core_utils/llm_response_utils/response_metadata.py b/litellm/litellm_core_utils/llm_response_utils/response_metadata.py index 06933a6fbcb..ba870eb9459 100644 --- a/litellm/litellm_core_utils/llm_response_utils/response_metadata.py +++ b/litellm/litellm_core_utils/llm_response_utils/response_metadata.py @@ -49,7 +49,8 @@ def set_hidden_params( result=self.result, litellm_model_name=model, router_model_id=model_id ), "additional_headers": process_response_headers( - self._get_value_from_hidden_params("additional_headers") or {} + self._get_value_from_hidden_params("additional_headers") or {}, + preserve_litellm_internal_headers=True, ), "litellm_model_name": model, } diff --git a/tests/test_litellm/litellm_core_utils/test_fallback_utils.py b/tests/test_litellm/litellm_core_utils/test_fallback_utils.py index 0c542ff6a1b..90a61696e9d 100644 --- a/tests/test_litellm/litellm_core_utils/test_fallback_utils.py +++ b/tests/test_litellm/litellm_core_utils/test_fallback_utils.py @@ -1,7 +1,13 @@ +"""Tests for litellm.litellm_core_utils.fallback_utils.""" + import pytest +import httpx import litellm -from litellm.litellm_core_utils.fallback_utils import async_completion_with_fallbacks +from litellm.litellm_core_utils.core_helpers import process_response_headers +from litellm.litellm_core_utils.fallback_utils import ( + async_completion_with_fallbacks, +) @pytest.mark.asyncio @@ -41,3 +47,123 @@ async def _fake_acompletion(*, model: str, **kwargs): "primary-model", "fallback-model", ] + + +@pytest.mark.asyncio +async def test_async_completion_with_fallbacks_sets_attempted_fallbacks_header(): + """ + When a fallback succeeds, the response must carry the + `x-litellm-attempted-fallbacks` header so the proxy and other callers can + detect that a fallback occurred. Without it, + `_override_openai_response_model` stamps the requested model back over the + fallback model used. See issue #28241. + """ + response = await async_completion_with_fallbacks( + model="openai/primary-llm", + messages=[{"role": "user", "content": "hi"}], + api_key="fake-key", + mock_response=Exception("forced failure"), + kwargs={ + "fallbacks": [ + { + "model": "openai/backup-llm", + "api_key": "fake-key", + "mock_response": "backup-resp", + } + ] + }, + ) + + hidden_params = getattr(response, "_hidden_params", None) + assert isinstance(hidden_params, dict) + headers = hidden_params.get("additional_headers") or {} + assert headers.get("x-litellm-attempted-fallbacks") == 1 + + +@pytest.mark.asyncio +async def test_async_completion_with_fallbacks_header_is_zero_when_primary_succeeds(): + """ + When the primary model succeeds on the first attempt, the header should be + `0` (no fallback was used). This mirrors the existing router-level + semantics in `async_function_with_fallbacks`. + """ + response = await async_completion_with_fallbacks( + model="openai/primary-llm", + messages=[{"role": "user", "content": "hi"}], + api_key="fake-key", + mock_response="primary-resp", + kwargs={ + "fallbacks": [ + { + "model": "openai/backup-llm", + "api_key": "fake-key", + "mock_response": "backup-resp", + } + ] + }, + ) + + hidden_params = getattr(response, "_hidden_params", None) + assert isinstance(hidden_params, dict) + headers = hidden_params.get("additional_headers") or {} + assert headers.get("x-litellm-attempted-fallbacks") == 0 + assert response.choices[0].message.content == "primary-resp" + + +def test_process_response_headers_preserves_x_litellm_headers_when_internal(): + """ + `process_response_headers` must not add the `llm_provider-` prefix to + LiteLLM's own internal headers (anything starting with `x-litellm-`) when + the caller has marked the input as LiteLLM-owned. These are markers set by + LiteLLM (e.g. fallback / retry headers); the proxy and other callers look + up the bare key. + """ + result = process_response_headers( + { + "x-litellm-attempted-fallbacks": 1, + "x-litellm-model-group": "gpt-4", + "x-stainless-arch": "arm64", + }, + preserve_litellm_internal_headers=True, + ) + assert result["x-litellm-attempted-fallbacks"] == 1 + assert result["x-litellm-model-group"] == "gpt-4" + assert result["llm_provider-x-stainless-arch"] == "arm64" + + +def test_process_response_headers_prefixes_x_litellm_from_raw_provider(): + """ + On raw upstream-provider headers (default `preserve_litellm_internal_headers=False`), + a header whose name starts with `x-litellm-` MUST still get the + `llm_provider-` prefix. Otherwise a malicious provider could return + `x-litellm-attempted-fallbacks` and spoof a LiteLLM-internal marker, + bypassing the proxy model-override guard. + """ + result = process_response_headers( + { + "x-litellm-attempted-fallbacks": 99, + "x-stainless-arch": "arm64", + } + ) + assert "x-litellm-attempted-fallbacks" not in result + assert result["llm_provider-x-litellm-attempted-fallbacks"] == 99 + assert result["llm_provider-x-stainless-arch"] == "arm64" + + +def test_process_response_headers_ignores_preserve_flag_for_httpx_headers(): + """ + Some providers store raw httpx.Headers directly in _hidden_params["additional_headers"] + without a prior normalization pass. If preserve_litellm_internal_headers=True were + honored for httpx.Headers inputs, a provider returning x-litellm-attempted-fallbacks + could spoof it as a bare LiteLLM-internal marker and make the proxy skip + stamping the correct response model. The flag must be ignored for httpx.Headers. + """ + raw = httpx.Headers( + { + "x-litellm-attempted-fallbacks": "1", + "content-type": "application/json", + } + ) + result = process_response_headers(raw, preserve_litellm_internal_headers=True) + assert "x-litellm-attempted-fallbacks" not in result + assert result["llm_provider-x-litellm-attempted-fallbacks"] == "1"