fix: harden codex responses transport

Vasu Bansal
2026-04-01 12:34:27 +05:30
parent 34b9c33581
commit 92f07ce194
5 changed files with 86 additions and 24 deletions
+9 -4
@@ -108,10 +108,15 @@ class CodexResponsesAdapter:
         system_messages: list[dict[str, Any]] = []
         if system:
             chunks = self.chunk_system_prompt(system)
-            if len(chunks) > 1 or len(chunks[0]) > _CODEX_SYSTEM_CHUNK_CHARS:
-                system_messages.append({"role": "system", "content": _CODEX_SYSTEM_PREAMBLE})
-            for chunk in chunks:
-                system_messages.append({"role": "system", "content": chunk})
+            if chunks:
+                if len(chunks) > 1 or len(chunks[0]) > _CODEX_SYSTEM_CHUNK_CHARS:
+                    system_messages.append({"role": "system", "content": _CODEX_SYSTEM_PREAMBLE})
+                for chunk in chunks:
+                    system_messages.append({"role": "system", "content": chunk})
+            else:
+                system_messages.append(
+                    {"role": "system", "content": "You are a helpful assistant."}
+                )
         else:
             system_messages.append({"role": "system", "content": "You are a helpful assistant."})
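The new guard covers prompts that chunk to nothing, such as whitespace-only input, which previously produced no system message at all. A compact standalone sketch of the resulting behavior, using a toy chunker and placeholder constants rather than the adapter's real helpers:

from typing import Any

_CHUNK_CHARS = 8_000  # placeholder for _CODEX_SYSTEM_CHUNK_CHARS
_PREAMBLE = "The system prompt continues across the following messages."  # placeholder for _CODEX_SYSTEM_PREAMBLE
_DEFAULT = {"role": "system", "content": "You are a helpful assistant."}

def chunk_system_prompt(system: str) -> list[str]:
    # Toy chunker: whitespace-only input yields no chunks at all.
    text = system.strip()
    return [text[i:i + _CHUNK_CHARS] for i in range(0, len(text), _CHUNK_CHARS)] if text else []

def build_system_messages(system: str | None) -> list[dict[str, Any]]:
    if not system:
        return [dict(_DEFAULT)]
    chunks = chunk_system_prompt(system)
    if not chunks:
        # The fallback added by this commit: never send an empty system block.
        return [dict(_DEFAULT)]
    messages: list[dict[str, Any]] = []
    if len(chunks) > 1 or len(chunks[0]) > _CHUNK_CHARS:
        messages.append({"role": "system", "content": _PREAMBLE})
    messages.extend({"role": "system", "content": chunk} for chunk in chunks)
    return messages

assert build_system_messages("\n \n") == [_DEFAULT]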
+1 -1
@@ -24,7 +24,7 @@ def is_codex_api_base(api_base: str | None) -> bool:
     parsed = urlparse(api_base)
     path = parsed.path.rstrip("/")
     return (
-        parsed.scheme in {"http", "https"}
+        parsed.scheme == "https"
         and parsed.hostname == _CODEX_HOST
         and (path == _CODEX_PATH or path == f"{_CODEX_PATH}/responses")
     )
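With the scheme restricted to https, a plain-HTTP URL pointing at the Codex host is no longer treated as the Codex backend. A standalone sketch of the check with the module constants inlined (values inferred from the tests below, not copied from the source):

from urllib.parse import urlparse

_CODEX_HOST = "chatgpt.com"          # assumed value of the module constant
_CODEX_PATH = "/backend-api/codex"   # assumed value of the module constant

def is_codex_api_base(api_base: str | None) -> bool:
    if not api_base:
        return False
    parsed = urlparse(api_base)
    path = parsed.path.rstrip("/")
    return (
        parsed.scheme == "https"
        and parsed.hostname == _CODEX_HOST
        and (path == _CODEX_PATH or path == f"{_CODEX_PATH}/responses")
    )

assert is_codex_api_base("https://chatgpt.com/backend-api/codex")
assert not is_codex_api_base("http://chatgpt.com/backend-api/codex")    # downgraded scheme
assert not is_codex_api_base("https://example.com/backend-api/codex")   # wrong host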
+40 -16
@@ -16,6 +16,7 @@ import os
 import re
 import time
 from collections.abc import AsyncIterator
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -557,18 +558,6 @@ class LiteLLMProvider(LLMProvider):
"LiteLLM is not installed. Please install it with: uv pip install litellm"
)
# The Codex ChatGPT backend is a Responses API endpoint at
# chatgpt.com/backend-api/codex/responses. LiteLLM's model registry
# marks legacy codex models (gpt-5.3-codex) with mode="responses",
# but newer models like gpt-5.4 default to mode="chat". Force
# mode="responses" so litellm routes through the responses_api_bridge.
if self._codex_backend and litellm is not None:
_strip = self.model.removeprefix("openai/")
_entry = litellm.model_cost.get(_strip, {})
if _entry.get("mode") != "responses":
litellm.model_cost.setdefault(_strip, {})
litellm.model_cost[_strip]["mode"] = "responses"
@staticmethod
def _default_api_base_for_model(model: str) -> str | None:
"""Return provider-specific default API base when required."""
@@ -592,6 +581,33 @@ class LiteLLMProvider(LLMProvider):
"""Break large system prompts into smaller Codex-friendly chunks."""
return self._codex_adapter.chunk_system_prompt(system)
@contextmanager
def _codex_responses_mode_override(self, model: str | None = None):
"""Temporarily route Codex requests through LiteLLM's Responses bridge."""
if not self._codex_backend or litellm is None:
yield
return
stripped_model = (model or self.model).removeprefix("openai/")
previous_entry = litellm.model_cost.get(stripped_model)
previous_mode = previous_entry.get("mode") if isinstance(previous_entry, dict) else None
if previous_mode == "responses":
yield
return
if previous_entry is None:
litellm.model_cost[stripped_model] = {"mode": "responses"}
else:
litellm.model_cost[stripped_model] = {**previous_entry, "mode": "responses"}
try:
yield
finally:
if previous_entry is None:
litellm.model_cost.pop(stripped_model, None)
else:
litellm.model_cost[stripped_model] = previous_entry
def _build_request_messages(
self,
messages: list[dict[str, Any]],
@@ -719,7 +735,8 @@ class LiteLLMProvider(LLMProvider):
         retries = max_retries if max_retries is not None else RATE_LIMIT_MAX_RETRIES
         for attempt in range(retries + 1):
             try:
-                response = litellm.completion(**kwargs)  # type: ignore[union-attr]
+                with self._codex_responses_mode_override(model):
+                    response = litellm.completion(**kwargs)  # type: ignore[union-attr]
 
                 # Some providers (e.g. Gemini) return 200 with empty content on
                 # rate limit / quota exhaustion instead of a proper 429. Treat
@@ -899,7 +916,8 @@ class LiteLLMProvider(LLMProvider):
         retries = max_retries if max_retries is not None else RATE_LIMIT_MAX_RETRIES
         for attempt in range(retries + 1):
             try:
-                response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
+                with self._codex_responses_mode_override(model):
+                    response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
 
                 content = response.choices[0].message.content if response.choices else None
                 has_tool_calls = bool(response.choices and response.choices[0].message.tool_calls)
@@ -1824,10 +1842,15 @@ class LiteLLMProvider(LLMProvider):
"""Try a non-stream completion when Codex returns an empty stream."""
if not self._codex_backend:
return None
async def _codex_acompletion(**fallback_kwargs: Any) -> Any:
with self._codex_responses_mode_override(fallback_kwargs.get("model")):
return await litellm.acompletion(**fallback_kwargs) # type: ignore[union-attr]
return await self._codex_adapter.recover_empty_stream(
kwargs,
last_role=last_role,
acompletion=litellm.acompletion, # type: ignore[union-attr]
acompletion=_codex_acompletion,
)
def _merge_tool_call_chunk(
@@ -1914,7 +1937,8 @@ class LiteLLMProvider(LLMProvider):
         stream_finish_reason: str | None = None
 
         try:
-            response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
+            with self._codex_responses_mode_override(kwargs.get("model")):
+                response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
 
             async for chunk in response:
                 # Capture usage from the trailing usage-only chunk that
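The context manager above replaces the constructor-time registry mutation removed earlier in this file, so the mode="responses" entry now only exists while a single request is in flight. A minimal sketch of the same save/override/restore pattern against a toy registry dict standing in for litellm.model_cost (names here are illustrative, not the provider's API):

from contextlib import contextmanager
from collections.abc import Iterator
from typing import Any

MODEL_REGISTRY: dict[str, dict[str, Any]] = {"gpt-5.4": {"mode": "chat"}}

@contextmanager
def responses_mode(model: str) -> Iterator[None]:
    previous = MODEL_REGISTRY.get(model)
    if isinstance(previous, dict) and previous.get("mode") == "responses":
        yield  # already routed through the Responses bridge; nothing to change
        return
    # Install the override without mutating the saved entry in place.
    MODEL_REGISTRY[model] = {**previous, "mode": "responses"} if previous else {"mode": "responses"}
    try:
        yield
    finally:
        # Restore (or drop) the entry so other models and later requests see the old state.
        if previous is None:
            MODEL_REGISTRY.pop(model, None)
        else:
            MODEL_REGISTRY[model] = previous

with responses_mode("gpt-5.4"):
    assert MODEL_REGISTRY["gpt-5.4"]["mode"] == "responses"
assert MODEL_REGISTRY["gpt-5.4"]["mode"] == "chat"  # restored after the request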
+1
@@ -115,6 +115,7 @@ class TestCodexConfig:
     def test_codex_api_base_detection_requires_real_chatgpt_origin(self):
         assert is_codex_api_base("https://chatgpt.com/backend-api/codex")
         assert is_codex_api_base("https://chatgpt.com/backend-api/codex/responses")
+        assert not is_codex_api_base("http://chatgpt.com/backend-api/codex")
         assert not is_codex_api_base(
             "https://proxy.example/v1?target=https://chatgpt.com/backend-api/codex"
         )
+35 -3
@@ -918,8 +918,8 @@ class TestCodexEmptyStreamRecovery:
 class TestCodexRequestHardening:
-    def test_codex_backend_forces_responses_mode_for_newer_models(self):
-        """Codex backend should force LiteLLM through the Responses bridge."""
+    def test_codex_backend_forces_responses_mode_per_request(self):
+        """Codex backend should scope the Responses override to the active request."""
         import litellm
 
         original = litellm.model_cost.get("gpt-5.4")
@@ -932,7 +932,27 @@ class TestCodexRequestHardening:
api_base="https://chatgpt.com/backend-api/codex",
)
assert provider._codex_backend is True
assert litellm.model_cost["gpt-5.4"]["mode"] == "responses"
async def _fake_acompletion(**kwargs):
assert kwargs["model"] == "openai/gpt-5.4"
assert litellm.model_cost["gpt-5.4"]["mode"] == "responses"
response = MagicMock()
response.choices = [MagicMock()]
response.choices[0].message.content = "ok"
response.choices[0].message.tool_calls = []
response.choices[0].finish_reason = "stop"
return response
with patch("litellm.acompletion", new=_fake_acompletion):
response = asyncio.run(
provider._acompletion_with_rate_limit_retry(
model=provider.model,
messages=[{"role": "user", "content": "hi"}],
)
)
assert response.choices[0].message.content == "ok"
assert litellm.model_cost["gpt-5.4"]["mode"] == "chat"
finally:
if original is None:
litellm.model_cost.pop("gpt-5.4", None)
@@ -973,6 +993,18 @@ class TestCodexRequestHardening:
assert kwargs["api_base"] == "https://chatgpt.com/backend-api/codex"
assert "store" in kwargs["allowed_openai_params"]
def test_codex_build_system_messages_handles_whitespace_only_prompt(self):
"""Whitespace-only system prompts should fall back to the default prompt."""
provider = LiteLLMProvider(
model="openai/gpt-5.4",
api_key="test-key",
api_base="https://chatgpt.com/backend-api/codex",
)
messages = provider._codex_adapter.build_system_messages("\n \n", json_mode=False)
assert messages == [{"role": "system", "content": "You are a helpful assistant."}]
def test_codex_merge_tool_call_chunk_handles_parallel_calls_with_broken_indexes(self):
"""Codex chunk merging should survive index=0 for multiple parallel tool calls."""
from types import SimpleNamespace