feat: vision fallback with intent

Richard Tang
2026-04-30 13:02:57 -07:00
parent a0817fcde4
commit 73511a3c59
4 changed files with 134 additions and 30 deletions
+2 -19
@@ -181,23 +181,6 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
return cleaned
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images for the injection-queue drain (no preceding tool call).
Wraps :func:`_captioning_chain` with a generic intent and returns
the caption inside an ``[image attached — description: ...]`` envelope
so the injected text reads as image content rather than free-form
prose.
"""
intent = "Describe the attached image(s) so a text-only agent can understand them."
result = await _captioning_chain(intent, image_content)
if not result:
return None
description, model = result
label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
return f"[{label} attached — description: {description}]", model
def _vision_fallback_active(model: str | None) -> bool:
"""Return True if tool-result images for *model* should be routed
through the vision-fallback chain rather than sent to the model.
@@ -3435,7 +3418,7 @@ class AgentLoop(AgentProtocol):
# single image needs captioning, this collapses to a
# single await with no overhead.
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
- caption_tasks: dict[str, asyncio.Task[str | None]] = {}
+ caption_tasks: dict[str, asyncio.Task[tuple[str, str] | None]] = {}
if _model_text_only:
for tc in tool_calls[:executed_in_batch]:
res = results_by_id.get(tc.tool_use_id)
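The comment above describes the captioning fan-out: one task per image-bearing tool result, keyed by tool_use_id, so a turn with a single image still costs a single await. A minimal sketch of that pattern under the new tuple[str, str] | None result shape; the function and variable names here are illustrative, not the loop's real internals:

import asyncio
from typing import Any, Awaitable, Callable

CaptionFn = Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]]

async def caption_in_parallel(
    images_by_tool_use_id: dict[str, list[dict[str, Any]]],
    caption_fn: CaptionFn,
    intent: str,
) -> dict[str, tuple[str, str] | None]:
    # Fan out: one captioning task per tool result that carries images.
    tasks = {
        tool_use_id: asyncio.create_task(caption_fn(intent, images))
        for tool_use_id, images in images_by_tool_use_id.items()
        if images
    }
    # Fan in: with only one image this collapses to a single await.
    return {tool_use_id: await task for tool_use_id, task in tasks.items()}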
@@ -4139,7 +4122,7 @@ class AgentLoop(AgentProtocol):
queue=self._injection_queue,
conversation=conversation,
ctx=ctx,
- describe_images_as_text_fn=_describe_images_as_text,
+ caption_image_fn=_captioning_chain,
)
async def _drain_trigger_queue(self, conversation: NodeConversation) -> int:
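The drain above is now wired with _captioning_chain directly instead of the removed _describe_images_as_text wrapper. For reference, the contract it assumes of the hook: an awaitable taking (intent, image_content) and resolving to (caption, model) on success or None on failure. A stub satisfying that contract; the body and model id are illustrative, not the real _captioning_chain:

from typing import Any

async def caption_image_fn(
    intent: str, image_content: list[dict[str, Any]]
) -> tuple[str, str] | None:
    # Return (caption, model_id) on success; returning None lets the
    # drain inject the user's text without an image description.
    caption = f"{len(image_content)} image(s) described for intent: {intent!r}"
    return caption, "example/vision-model"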
@@ -162,9 +162,20 @@ async def drain_injection_queue(
conversation: NodeConversation,
*,
ctx: NodeContext,
- describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None,
+ caption_image_fn: (
+ Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None
+ ) = None,
) -> int:
- """Drain all pending injected events as user messages. Returns count."""
+ """Drain all pending injected events as user messages. Returns count.
``caption_image_fn`` is the unified vision fallback hook. It takes
``(intent, image_content)`` and returns ``(caption, model)`` on
success; the model id is logged so the destination is observable.
The user's typed ``content`` (the injected message body) is passed
as the intent so the captioner can answer the user's specific
question about the image rather than producing a generic
description; an empty content falls back to a generic intent.
"""
count = 0
logger.debug(
"[drain_injection_queue] Starting to drain queue, initial queue size: %s",
@@ -184,10 +195,13 @@ async def drain_injection_queue(
"Model '%s' does not support images; attempting vision fallback",
ctx.llm.model,
)
- if describe_images_as_text_fn is not None:
- described = await describe_images_as_text_fn(image_content)
- if described:
- description, vision_model = described
+ if caption_image_fn is not None:
+ intent = content or (
+ "Describe these user-injected images for a text-only agent."
+ )
+ caption_result = await caption_image_fn(intent, image_content)
+ if caption_result:
+ description, vision_model = caption_result
content = f"{content}\n\n{description}" if content else description
logger.info(
"[drain] image described as text via vision fallback (model '%s')",
@@ -196,16 +196,53 @@ async def caption_tool_image(
{"role": "user", "content": user_blocks},
]
# Apply the same proxy rewrites the main LLM provider uses so a
# `hive/...` / `kimi/...` model resolves to the right Anthropic-
# compatible endpoint with the right auth header. Without this,
# litellm doesn't know what `hive/kimi-k2.5` is and rejects the call
# with "LLM Provider NOT provided."
from framework.llm.litellm import rewrite_proxy_model
rewritten_model, rewritten_base, extra_headers = rewrite_proxy_model(
model, api_key, api_base
)
kwargs: dict[str, Any] = {
- "model": model,
+ "model": rewritten_model,
"messages": messages,
"max_tokens": 1024,
"timeout": timeout_s,
}
# Pass api_key directly only when there are no proxy-rewritten
# extra_headers carrying the auth (e.g. the gemini-3-flash override
# path goes direct to Gemini, not through the Hive proxy).
- if api_key:
+ if api_key and not extra_headers:
kwargs["api_key"] = api_key
- if api_base:
- kwargs["api_base"] = api_base
+ if rewritten_base:
+ kwargs["api_base"] = rewritten_base
if extra_headers:
kwargs["extra_headers"] = extra_headers
# Surface where the request is going so the user can verify the
# vision fallback is hitting the expected proxy / model. Redacts
# the API key to a length+head+tail digest so it can be cross-
# correlated with other auth-related log lines.
key_digest = (
f"len={len(api_key)} {api_key[:8]}{api_key[-4:]}"
if api_key and len(api_key) >= 12
else f"len={len(api_key) if api_key else 0}"
)
logger.info(
"[vision_fallback] dispatching: configured_model=%s rewritten_model=%s "
"api_base=%s api_key=%s images=%d intent_chars=%d timeout_s=%.1f",
model,
rewritten_model,
rewritten_base or "<litellm-default>",
key_digest,
len(image_content),
len(intent),
timeout_s,
)
started = datetime.now()
caption: str | None = None
@@ -215,9 +252,21 @@ async def caption_tool_image(
text = (response.choices[0].message.content or "").strip()
if text:
caption = text
logger.info(
"[vision_fallback] response: model=%s api_base=%s elapsed_s=%.2f chars=%d",
rewritten_model,
rewritten_base or "<litellm-default>",
(datetime.now() - started).total_seconds(),
len(text),
)
except Exception as exc:
error_text = f"{type(exc).__name__}: {exc}"
- logger.debug("vision_fallback model '%s' failed: %s", model, exc)
+ logger.warning(
"[vision_fallback] failed: model=%s api_base=%s error=%s",
rewritten_model,
rewritten_base or "<litellm-default>",
error_text,
)
# Best-effort audit log so users can grep ~/.hive/llm_logs/ for
# vision-fallback subagent calls. Failures here must not bubble.
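The redaction comment above yields digests like the one below, so a key can be matched across auth-related log lines without ever appearing verbatim. The key value here is made up; the format string is the one added in this diff:

api_key = "sk-example-0123456789abcdef"  # placeholder key, 27 chars
key_digest = (
    f"len={len(api_key)} {api_key[:8]}{api_key[-4:]}"
    if api_key and len(api_key) >= 12
    else f"len={len(api_key) if api_key else 0}"
)
assert key_digest == "len=27 sk-exampcdef"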
+58
@@ -211,6 +211,44 @@ def _ensure_ollama_chat_prefix(model: str) -> str:
return model
def rewrite_proxy_model(
model: str, api_key: str | None, api_base: str | None
) -> tuple[str, str | None, dict[str, str]]:
"""Apply Hive/Kimi proxy rewrites for any caller of ``litellm.acompletion``.
Both the Hive LLM proxy and Kimi For Coding expose Anthropic-API-
compatible endpoints. LiteLLM doesn't recognise the ``hive/`` or
``kimi/`` prefixes natively, so we rewrite them to ``anthropic/``
here. For the Hive proxy we also stamp a Bearer token into
``extra_headers`` because litellm's Anthropic handler only sends
``x-api-key`` and the proxy expects ``Authorization: Bearer``.
Used by ad-hoc ``litellm.acompletion`` callers (e.g. the vision-
fallback subagent in ``caption_tool_image``) so they hit the same
proxy with the same auth as the main agent's ``LiteLLMProvider``.
The provider's own ``__init__`` keeps its inlined rewrite for now —
this helper is the single source of truth for ad-hoc callers.
Returns: (rewritten_model, normalised_api_base, extra_headers).
The ``extra_headers`` dict is non-empty only for the Hive proxy
(and only when ``api_key`` is provided).
"""
extra_headers: dict[str, str] = {}
if model.lower().startswith("kimi/"):
model = "anthropic/" + model[len("kimi/") :]
if api_base and api_base.rstrip("/").endswith("/v1"):
api_base = api_base.rstrip("/")[:-3]
elif model.lower().startswith("hive/"):
model = "anthropic/" + model[len("hive/") :]
if api_base and api_base.rstrip("/").endswith("/v1"):
api_base = api_base.rstrip("/")[:-3]
# Hive proxy expects Bearer auth; litellm's Anthropic handler
# only sends x-api-key without this nudge.
if api_key:
extra_headers["Authorization"] = f"Bearer {api_key}"
return model, api_base, extra_headers
RATE_LIMIT_MAX_RETRIES = 10
RATE_LIMIT_BACKOFF_BASE = 2 # seconds
RATE_LIMIT_MAX_DELAY = 120 # seconds - cap to prevent absurd waits
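To make the rewrite concrete, here is what the helper above returns for the two prefixes it handles; the key and base URLs are placeholder values:

# Hive proxy: prefix rewritten, trailing /v1 stripped, Bearer header stamped.
model, base, headers = rewrite_proxy_model(
    "hive/kimi-k2.5", "sk-placeholder", "https://hive-proxy.example/v1"
)
assert model == "anthropic/kimi-k2.5"
assert base == "https://hive-proxy.example"
assert headers == {"Authorization": "Bearer sk-placeholder"}

# Kimi For Coding: same model/base rewrite, but no extra headers, since
# litellm's default x-api-key auth is what that endpoint expects.
model, base, headers = rewrite_proxy_model(
    "kimi/kimi-k2.5", "sk-placeholder", "https://api.kimi.example/v1"
)
assert model == "anthropic/kimi-k2.5"
assert base == "https://api.kimi.example"
assert headers == {}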
@@ -963,6 +1001,7 @@ class LiteLLMProvider(LLMProvider):
# Translate kimi/ prefix to anthropic/ so litellm uses the Anthropic
# Messages API handler and routes to that endpoint — no special headers needed.
_original_model = model
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
if _is_ollama_model(model):
model = _ensure_ollama_chat_prefix(model)
elif model.lower().startswith("kimi/"):
@@ -1016,6 +1055,7 @@ class LiteLLMProvider(LLMProvider):
these attributes in-place propagates to all callers on the next LLM call.
"""
_original_model = model
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
if _is_ollama_model(model):
model = _ensure_ollama_chat_prefix(model)
elif model.lower().startswith("kimi/"):
@@ -1255,6 +1295,16 @@ class LiteLLMProvider(LLMProvider):
# Ollama requires explicit tool_choice=auto for function calling
# so future readers don't have to guess.
kwargs.setdefault("tool_choice", "auto")
elif self._hive_proxy_auth:
# The Hive LLM proxy fronts GLM, which drifts into "explain
# the plan" mode on long-context turns instead of emitting
# tool_use blocks (verified 2026-04-28: tool_choice=null →
# text-only stop=stop; tool_choice=required → clean
# tool_use). Force a tool call when tools are available
# so queens can't get stuck in chat mode. Callers that
# legitimately want a non-tool turn can override via
# extra_kwargs.
kwargs.setdefault("tool_choice", "required")
# Add response_format for structured output
# LiteLLM passes this through to the underlying provider
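As the tool_choice comment above notes, callers that want a non-tool turn can still override the forcing through extra_kwargs: because the provider uses kwargs.setdefault, an explicit tool_choice always wins. A minimal illustration of that dict semantics (plain Python, not the provider API itself):

kwargs = {"tool_choice": "none"}  # caller-supplied via extra_kwargs
kwargs.setdefault("tool_choice", "required")
assert kwargs["tool_choice"] == "none"  # explicit value survives the forcing

kwargs = {}  # nothing supplied: the Hive-proxy default kicks in
kwargs.setdefault("tool_choice", "required")
assert kwargs["tool_choice"] == "required"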
@@ -1492,6 +1542,10 @@ class LiteLLMProvider(LLMProvider):
# Ollama requires explicit tool_choice=auto for function calling
# so future readers don't have to guess.
kwargs.setdefault("tool_choice", "auto")
elif self._hive_proxy_auth:
# See `complete()` for the rationale: GLM behind the Hive
# proxy needs forcing or it goes chat-mode on long contexts.
kwargs.setdefault("tool_choice", "required")
if response_format:
kwargs["response_format"] = response_format
@@ -2276,6 +2330,10 @@ class LiteLLMProvider(LLMProvider):
# Ollama requires explicit tool_choice=auto for function calling
# so future readers don't have to guess.
kwargs.setdefault("tool_choice", "auto")
elif self._hive_proxy_auth:
# See `complete()` for the rationale: GLM behind the Hive
# proxy needs forcing or it goes chat-mode on long contexts.
kwargs.setdefault("tool_choice", "required")
if response_format:
kwargs["response_format"] = response_format
# The Codex ChatGPT backend (Responses API) rejects several params.