fix: harden codex responses transport

Vasu Bansal
2026-04-01 12:34:27 +05:30
parent 34b9c33581
commit 92f07ce194
5 changed files with 86 additions and 24 deletions
+9 -4
@@ -108,10 +108,15 @@ class CodexResponsesAdapter:
         system_messages: list[dict[str, Any]] = []
         if system:
             chunks = self.chunk_system_prompt(system)
-            if len(chunks) > 1 or len(chunks[0]) > _CODEX_SYSTEM_CHUNK_CHARS:
-                system_messages.append({"role": "system", "content": _CODEX_SYSTEM_PREAMBLE})
-            for chunk in chunks:
-                system_messages.append({"role": "system", "content": chunk})
+            if chunks:
+                if len(chunks) > 1 or len(chunks[0]) > _CODEX_SYSTEM_CHUNK_CHARS:
+                    system_messages.append({"role": "system", "content": _CODEX_SYSTEM_PREAMBLE})
+                for chunk in chunks:
+                    system_messages.append({"role": "system", "content": chunk})
+            else:
+                system_messages.append(
+                    {"role": "system", "content": "You are a helpful assistant."}
+                )
         else:
             system_messages.append({"role": "system", "content": "You are a helpful assistant."})
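The new guard covers prompts that chunk to nothing, such as whitespace-only input, which previously produced no system message at all. A compact standalone sketch of the resulting behavior, using a toy chunker and placeholder constants rather than the adapter's real helpers:

from typing import Any

_CHUNK_CHARS = 8_000  # placeholder for _CODEX_SYSTEM_CHUNK_CHARS
_PREAMBLE = "The system prompt continues across the following messages."  # placeholder for _CODEX_SYSTEM_PREAMBLE
_DEFAULT = {"role": "system", "content": "You are a helpful assistant."}

def chunk_system_prompt(system: str) -> list[str]:
    # Toy chunker: whitespace-only input yields no chunks at all.
    text = system.strip()
    return [text[i:i + _CHUNK_CHARS] for i in range(0, len(text), _CHUNK_CHARS)] if text else []

def build_system_messages(system: str | None) -> list[dict[str, Any]]:
    if not system:
        return [dict(_DEFAULT)]
    chunks = chunk_system_prompt(system)
    if not chunks:
        # The fallback added by this commit: never send an empty system block.
        return [dict(_DEFAULT)]
    messages: list[dict[str, Any]] = []
    if len(chunks) > 1 or len(chunks[0]) > _CHUNK_CHARS:
        messages.append({"role": "system", "content": _PREAMBLE})
    messages.extend({"role": "system", "content": chunk} for chunk in chunks)
    return messages

assert build_system_messages("\n \n") == [_DEFAULT]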
+1 -1
@@ -24,7 +24,7 @@ def is_codex_api_base(api_base: str | None) -> bool:
     parsed = urlparse(api_base)
     path = parsed.path.rstrip("/")
     return (
-        parsed.scheme in {"http", "https"}
+        parsed.scheme == "https"
         and parsed.hostname == _CODEX_HOST
         and (path == _CODEX_PATH or path == f"{_CODEX_PATH}/responses")
     )
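With the scheme restricted to https, a plain-HTTP URL pointing at the Codex host is no longer treated as the Codex backend. A standalone sketch of the check with the module constants inlined (values inferred from the tests below, not copied from the source):

from urllib.parse import urlparse

_CODEX_HOST = "chatgpt.com"          # assumed value of the module constant
_CODEX_PATH = "/backend-api/codex"   # assumed value of the module constant

def is_codex_api_base(api_base: str | None) -> bool:
    if not api_base:
        return False
    parsed = urlparse(api_base)
    path = parsed.path.rstrip("/")
    return (
        parsed.scheme == "https"
        and parsed.hostname == _CODEX_HOST
        and (path == _CODEX_PATH or path == f"{_CODEX_PATH}/responses")
    )

assert is_codex_api_base("https://chatgpt.com/backend-api/codex")
assert not is_codex_api_base("http://chatgpt.com/backend-api/codex")    # downgraded scheme
assert not is_codex_api_base("https://example.com/backend-api/codex")   # wrong host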
+40 -16
@@ -16,6 +16,7 @@ import os
 import re
 import time
 from collections.abc import AsyncIterator
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -557,18 +558,6 @@ class LiteLLMProvider(LLMProvider):
"LiteLLM is not installed. Please install it with: uv pip install litellm"
)
# The Codex ChatGPT backend is a Responses API endpoint at
# chatgpt.com/backend-api/codex/responses. LiteLLM's model registry
# marks legacy codex models (gpt-5.3-codex) with mode="responses",
# but newer models like gpt-5.4 default to mode="chat". Force
# mode="responses" so litellm routes through the responses_api_bridge.
if self._codex_backend and litellm is not None:
_strip = self.model.removeprefix("openai/")
_entry = litellm.model_cost.get(_strip, {})
if _entry.get("mode") != "responses":
litellm.model_cost.setdefault(_strip, {})
litellm.model_cost[_strip]["mode"] = "responses"
@staticmethod
def _default_api_base_for_model(model: str) -> str | None:
"""Return provider-specific default API base when required."""
@@ -592,6 +581,33 @@ class LiteLLMProvider(LLMProvider):
"""Break large system prompts into smaller Codex-friendly chunks."""
return self._codex_adapter.chunk_system_prompt(system)
@contextmanager
def _codex_responses_mode_override(self, model: str | None = None):
"""Temporarily route Codex requests through LiteLLM's Responses bridge."""
if not self._codex_backend or litellm is None:
yield
return
stripped_model = (model or self.model).removeprefix("openai/")
previous_entry = litellm.model_cost.get(stripped_model)
previous_mode = previous_entry.get("mode") if isinstance(previous_entry, dict) else None
if previous_mode == "responses":
yield
return
if previous_entry is None:
litellm.model_cost[stripped_model] = {"mode": "responses"}
else:
litellm.model_cost[stripped_model] = {**previous_entry, "mode": "responses"}
try:
yield
finally:
if previous_entry is None:
litellm.model_cost.pop(stripped_model, None)
else:
litellm.model_cost[stripped_model] = previous_entry
def _build_request_messages(
self,
messages: list[dict[str, Any]],
@@ -719,7 +735,8 @@ class LiteLLMProvider(LLMProvider):
         retries = max_retries if max_retries is not None else RATE_LIMIT_MAX_RETRIES
         for attempt in range(retries + 1):
             try:
-                response = litellm.completion(**kwargs)  # type: ignore[union-attr]
+                with self._codex_responses_mode_override(model):
+                    response = litellm.completion(**kwargs)  # type: ignore[union-attr]
 
                 # Some providers (e.g. Gemini) return 200 with empty content on
                 # rate limit / quota exhaustion instead of a proper 429. Treat
@@ -899,7 +916,8 @@ class LiteLLMProvider(LLMProvider):
         retries = max_retries if max_retries is not None else RATE_LIMIT_MAX_RETRIES
         for attempt in range(retries + 1):
             try:
-                response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
+                with self._codex_responses_mode_override(model):
+                    response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
 
                 content = response.choices[0].message.content if response.choices else None
                 has_tool_calls = bool(response.choices and response.choices[0].message.tool_calls)
@@ -1824,10 +1842,15 @@ class LiteLLMProvider(LLMProvider):
"""Try a non-stream completion when Codex returns an empty stream."""
if not self._codex_backend:
return None
async def _codex_acompletion(**fallback_kwargs: Any) -> Any:
with self._codex_responses_mode_override(fallback_kwargs.get("model")):
return await litellm.acompletion(**fallback_kwargs) # type: ignore[union-attr]
return await self._codex_adapter.recover_empty_stream(
kwargs,
last_role=last_role,
acompletion=litellm.acompletion, # type: ignore[union-attr]
acompletion=_codex_acompletion,
)
def _merge_tool_call_chunk(
@@ -1914,7 +1937,8 @@ class LiteLLMProvider(LLMProvider):
         stream_finish_reason: str | None = None
 
         try:
-            response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
+            with self._codex_responses_mode_override(kwargs.get("model")):
+                response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
 
             async for chunk in response:
                 # Capture usage from the trailing usage-only chunk that
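The context manager above replaces the constructor-time registry mutation removed earlier in this file, so the mode="responses" entry now only exists while a single request is in flight. A minimal sketch of the same save/override/restore pattern against a toy registry dict standing in for litellm.model_cost (names here are illustrative, not the provider's API):

from contextlib import contextmanager
from collections.abc import Iterator
from typing import Any

MODEL_REGISTRY: dict[str, dict[str, Any]] = {"gpt-5.4": {"mode": "chat"}}

@contextmanager
def responses_mode(model: str) -> Iterator[None]:
    previous = MODEL_REGISTRY.get(model)
    if isinstance(previous, dict) and previous.get("mode") == "responses":
        yield  # already routed through the Responses bridge; nothing to change
        return
    # Install the override without mutating the saved entry in place.
    MODEL_REGISTRY[model] = {**previous, "mode": "responses"} if previous else {"mode": "responses"}
    try:
        yield
    finally:
        # Restore (or drop) the entry so other models and later requests see the old state.
        if previous is None:
            MODEL_REGISTRY.pop(model, None)
        else:
            MODEL_REGISTRY[model] = previous

with responses_mode("gpt-5.4"):
    assert MODEL_REGISTRY["gpt-5.4"]["mode"] == "responses"
assert MODEL_REGISTRY["gpt-5.4"]["mode"] == "chat"  # restored after the request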
+1
@@ -115,6 +115,7 @@ class TestCodexConfig:
     def test_codex_api_base_detection_requires_real_chatgpt_origin(self):
         assert is_codex_api_base("https://chatgpt.com/backend-api/codex")
         assert is_codex_api_base("https://chatgpt.com/backend-api/codex/responses")
+        assert not is_codex_api_base("http://chatgpt.com/backend-api/codex")
         assert not is_codex_api_base(
             "https://proxy.example/v1?target=https://chatgpt.com/backend-api/codex"
         )
+35 -3
@@ -918,8 +918,8 @@ class TestCodexEmptyStreamRecovery:
 class TestCodexRequestHardening:
-    def test_codex_backend_forces_responses_mode_for_newer_models(self):
-        """Codex backend should force LiteLLM through the Responses bridge."""
+    def test_codex_backend_forces_responses_mode_per_request(self):
+        """Codex backend should scope the Responses override to the active request."""
         import litellm
 
         original = litellm.model_cost.get("gpt-5.4")
@@ -932,7 +932,27 @@ class TestCodexRequestHardening:
api_base="https://chatgpt.com/backend-api/codex",
)
assert provider._codex_backend is True
assert litellm.model_cost["gpt-5.4"]["mode"] == "responses"
async def _fake_acompletion(**kwargs):
assert kwargs["model"] == "openai/gpt-5.4"
assert litellm.model_cost["gpt-5.4"]["mode"] == "responses"
response = MagicMock()
response.choices = [MagicMock()]
response.choices[0].message.content = "ok"
response.choices[0].message.tool_calls = []
response.choices[0].finish_reason = "stop"
return response
with patch("litellm.acompletion", new=_fake_acompletion):
response = asyncio.run(
provider._acompletion_with_rate_limit_retry(
model=provider.model,
messages=[{"role": "user", "content": "hi"}],
)
)
assert response.choices[0].message.content == "ok"
assert litellm.model_cost["gpt-5.4"]["mode"] == "chat"
finally:
if original is None:
litellm.model_cost.pop("gpt-5.4", None)
@@ -973,6 +993,18 @@ class TestCodexRequestHardening:
assert kwargs["api_base"] == "https://chatgpt.com/backend-api/codex"
assert "store" in kwargs["allowed_openai_params"]
def test_codex_build_system_messages_handles_whitespace_only_prompt(self):
"""Whitespace-only system prompts should fall back to the default prompt."""
provider = LiteLLMProvider(
model="openai/gpt-5.4",
api_key="test-key",
api_base="https://chatgpt.com/backend-api/codex",
)
messages = provider._codex_adapter.build_system_messages("\n \n", json_mode=False)
assert messages == [{"role": "system", "content": "You are a helpful assistant."}]
def test_codex_merge_tool_call_chunk_handles_parallel_calls_with_broken_indexes(self):
"""Codex chunk merging should survive index=0 for multiple parallel tool calls."""
from types import SimpleNamespace