feat: vision model retry and fallback

This commit is contained in:
Richard Tang
2026-04-30 12:38:30 -07:00
parent 628ce9ca12
commit a0817fcde4
3 changed files with 51 additions and 73 deletions
+26 -56
View File
@@ -14,7 +14,6 @@ from __future__ import annotations
import asyncio import asyncio
import json import json
import logging import logging
import os
import re import re
import time import time
import uuid import uuid
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None: async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images using the best available vision model. """Describe images for the injection-queue drain (no preceding tool call).
Returns ``(description, model)`` on success — the formatted Wraps :func:`_captioning_chain` with a generic intent and returns
placeholder text plus the model id that produced it — or ``None`` the caption inside an ``[image attached — description: ]`` envelope
when every candidate fails or no API key is configured. so the injected text reads as image content rather than free-form
prose.
""" """
import litellm intent = "Describe the attached image(s) so a text-only agent can understand them."
result = await _captioning_chain(intent, image_content)
blocks: list[dict[str, Any]] = [ if not result:
{ return None
"type": "text", description, model = result
"text": ( label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
"Describe the following image(s) concisely but with enough detail " return f"[{label} attached — description: {description}]", model
"that a text-only AI assistant can understand the content and context."
),
}
]
blocks.extend(image_content)
candidates: list[str] = []
if os.environ.get("OPENAI_API_KEY"):
candidates.append("gpt-4o-mini")
if os.environ.get("ANTHROPIC_API_KEY"):
candidates.append("claude-3-haiku-20240307")
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
candidates.append("gemini/gemini-1.5-flash")
for model in candidates:
try:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": blocks}],
max_tokens=512,
)
description = (response.choices[0].message.content or "").strip()
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]", model
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
return None
def _vision_fallback_active(model: str | None) -> bool: def _vision_fallback_active(model: str | None) -> bool:
@@ -253,21 +222,22 @@ async def _captioning_chain(
intent: str, intent: str,
image_content: list[dict[str, Any]], image_content: list[dict[str, Any]],
) -> tuple[str, str] | None: ) -> tuple[str, str] | None:
"""Two-stage caption chain used by the agent-loop tool-result hook. """Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
Stage 1: configured ``vision_fallback`` model with intent + images. The Gemini override reuses the configured ``api_key`` / ``api_base``,
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku → so a Hive subscriber (whose token routes to a multi-model proxy)
gemini-flash) when stage 1 is unconfigured or fails. keeps coverage when their primary model glitches. Without
configured creds litellm falls through to env-based Gemini auth;
Returns ``(caption, model)`` — the caption text paired with the users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
model id that produced it — or ``None`` if both stages fail. third try.
Caller is responsible for the placeholder-on-None and the splice
into the persisted tool-result content.
""" """
result = await caption_tool_image(intent, image_content) if result := await caption_tool_image(intent, image_content):
if not result: return result
result = await _describe_images_as_text(image_content) logger.warning("vision_fallback failed; retrying configured model")
return result if result := await caption_tool_image(intent, image_content):
return result
logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
# Pattern for detecting context-window-exceeded errors across LLM providers. # Pattern for detecting context-window-exceeded errors across LLM providers.
@@ -18,9 +18,10 @@ This module provides:
Both helpers degrade silently — return ``None`` / a placeholder rather Both helpers degrade silently — return ``None`` / a placeholder rather
than raise — so a vision-fallback failure can never kill the main than raise — so a vision-fallback failure can never kill the main
agent's run. The agent-loop call site is responsible for chaining agent's run. The agent-loop call site retries the configured model
through to the existing generic-caption rotation once on a None return, then falls back to
(``_describe_images_as_text``) on a None return. ``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
of :func:`caption_tool_image`.
""" """
from __future__ import annotations from __future__ import annotations
@@ -156,25 +157,30 @@ async def caption_tool_image(
image_content: list[dict[str, Any]], image_content: list[dict[str, Any]],
*, *,
timeout_s: float = 30.0, timeout_s: float = 30.0,
model_override: str | None = None,
) -> tuple[str, str] | None: ) -> tuple[str, str] | None:
"""Caption the given images using the configured ``vision_fallback`` model. """Caption the given images using the configured ``vision_fallback`` model.
Returns ``(caption, model)`` on success — the model's text response Returns ``(caption, model)`` on success or ``None`` on any failure
paired with the model id that produced it — or ``None`` on any (no config, no API key, timeout, exception, empty response).
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the ``model_override`` swaps in a different litellm model id while
cost / latency / quality are auditable post-hoc, tagged with keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
``execution_id="vision_fallback_subagent"``. untouched. That's deliberate: Hive subscribers configure
``vision_fallback`` to point at the Hive proxy, which routes to
multiple models including Gemini so reusing the credentials lets
a Gemini-3-flash override still work without a separate
``GEMINI_API_KEY``. When no creds are configured, litellm falls
back to env-var resolution.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
""" """
model = get_vision_fallback_model() model = model_override or get_vision_fallback_model()
if not model: if not model:
return None return None
api_key = get_vision_fallback_api_key() api_key = get_vision_fallback_api_key()
api_base = get_vision_fallback_api_base() api_base = get_vision_fallback_api_base()
if not api_key: if not api_key and not model_override:
logger.debug("vision_fallback configured but no API key resolved; skipping") logger.debug("vision_fallback configured but no API key resolved; skipping")
return None return None
@@ -195,8 +201,9 @@ async def caption_tool_image(
"messages": messages, "messages": messages,
"max_tokens": 1024, "max_tokens": 1024,
"timeout": timeout_s, "timeout": timeout_s,
"api_key": api_key,
} }
if api_key:
kwargs["api_key"] = api_key
if api_base: if api_base:
kwargs["api_base"] = api_base kwargs["api_base"] = api_base
+4 -3
View File
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
Used by the agent-loop hook that captions tool-result images when the Used by the agent-loop hook that captions tool-result images when the
main agent's model cannot accept image content (text-only LLMs). main agent's model cannot accept image content (text-only LLMs).
When this returns None the fallback chain skips the configured-subagent When this returns None the captioning chain's configured + retry
stage and proceeds straight to the generic caption rotation attempts both no-op (returning None), and only the final
(``_describe_images_as_text``). ``gemini/gemini-3-flash-preview`` override has a chance to succeed
and only if a ``GEMINI_API_KEY`` is set in the environment.
""" """
vision = get_hive_config().get("vision_fallback", {}) vision = get_hive_config().get("vision_fallback", {})
if vision.get("provider") and vision.get("model"): if vision.get("provider") and vision.get("model"):