feat: vision model retry and fallback

This commit is contained in:
Richard Tang
2026-04-30 12:38:30 -07:00
parent 628ce9ca12
commit a0817fcde4
3 changed files with 51 additions and 73 deletions
+26 -56
View File
@@ -14,7 +14,6 @@ from __future__ import annotations
import asyncio
import json
import logging
import os
import re
import time
import uuid
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images using the best available vision model.
"""Describe images for the injection-queue drain (no preceding tool call).
Returns ``(description, model)`` on success — the formatted
placeholder text plus the model id that produced it — or ``None``
when every candidate fails or no API key is configured.
Wraps :func:`_captioning_chain` with a generic intent and returns
the caption inside an ``[image attached description: ]`` envelope
so the injected text reads as image content rather than free-form
prose.
"""
import litellm
blocks: list[dict[str, Any]] = [
{
"type": "text",
"text": (
"Describe the following image(s) concisely but with enough detail "
"that a text-only AI assistant can understand the content and context."
),
}
]
blocks.extend(image_content)
candidates: list[str] = []
if os.environ.get("OPENAI_API_KEY"):
candidates.append("gpt-4o-mini")
if os.environ.get("ANTHROPIC_API_KEY"):
candidates.append("claude-3-haiku-20240307")
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
candidates.append("gemini/gemini-1.5-flash")
for model in candidates:
try:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": blocks}],
max_tokens=512,
)
description = (response.choices[0].message.content or "").strip()
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]", model
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
return None
intent = "Describe the attached image(s) so a text-only agent can understand them."
result = await _captioning_chain(intent, image_content)
if not result:
return None
description, model = result
label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
return f"[{label} attached — description: {description}]", model
def _vision_fallback_active(model: str | None) -> bool:
@@ -253,21 +222,22 @@ async def _captioning_chain(
intent: str,
image_content: list[dict[str, Any]],
) -> tuple[str, str] | None:
"""Two-stage caption chain used by the agent-loop tool-result hook.
"""Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
Stage 1: configured ``vision_fallback`` model with intent + images.
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku →
gemini-flash) when stage 1 is unconfigured or fails.
Returns ``(caption, model)`` — the caption text paired with the
model id that produced it — or ``None`` if both stages fail.
Caller is responsible for the placeholder-on-None and the splice
into the persisted tool-result content.
The Gemini override reuses the configured ``api_key`` / ``api_base``,
so a Hive subscriber (whose token routes to a multi-model proxy)
keeps coverage when their primary model glitches. Without
configured creds litellm falls through to env-based Gemini auth;
users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
third try.
"""
result = await caption_tool_image(intent, image_content)
if not result:
result = await _describe_images_as_text(image_content)
return result
if result := await caption_tool_image(intent, image_content):
return result
logger.warning("vision_fallback failed; retrying configured model")
if result := await caption_tool_image(intent, image_content):
return result
logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
# Pattern for detecting context-window-exceeded errors across LLM providers.
@@ -18,9 +18,10 @@ This module provides:
Both helpers degrade silently — return ``None`` / a placeholder rather
than raise — so a vision-fallback failure can never kill the main
agent's run. The agent-loop call site is responsible for chaining
through to the existing generic-caption rotation
(``_describe_images_as_text``) on a None return.
agent's run. The agent-loop call site retries the configured model
once on a None return, then falls back to
``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
of :func:`caption_tool_image`.
"""
from __future__ import annotations
@@ -156,25 +157,30 @@ async def caption_tool_image(
image_content: list[dict[str, Any]],
*,
timeout_s: float = 30.0,
model_override: str | None = None,
) -> tuple[str, str] | None:
"""Caption the given images using the configured ``vision_fallback`` model.
Returns ``(caption, model)`` on success — the model's text response
paired with the model id that produced it — or ``None`` on any
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
Returns ``(caption, model)`` on success or ``None`` on any failure
(no config, no API key, timeout, exception, empty response).
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
cost / latency / quality are auditable post-hoc, tagged with
``execution_id="vision_fallback_subagent"``.
``model_override`` swaps in a different litellm model id while
keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
untouched. That's deliberate: Hive subscribers configure
``vision_fallback`` to point at the Hive proxy, which routes to
multiple models — including Gemini — so reusing the credentials lets
a Gemini-3-flash override still work without a separate
``GEMINI_API_KEY``. When no creds are configured, litellm falls
back to env-var resolution.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
"""
model = get_vision_fallback_model()
model = model_override or get_vision_fallback_model()
if not model:
return None
api_key = get_vision_fallback_api_key()
api_base = get_vision_fallback_api_base()
if not api_key:
if not api_key and not model_override:
logger.debug("vision_fallback configured but no API key resolved; skipping")
return None
@@ -195,8 +201,9 @@ async def caption_tool_image(
"messages": messages,
"max_tokens": 1024,
"timeout": timeout_s,
"api_key": api_key,
}
if api_key:
kwargs["api_key"] = api_key
if api_base:
kwargs["api_base"] = api_base
+4 -3
View File
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
Used by the agent-loop hook that captions tool-result images when the
main agent's model cannot accept image content (text-only LLMs).
When this returns None the fallback chain skips the configured-subagent
stage and proceeds straight to the generic caption rotation
(``_describe_images_as_text``).
When this returns None the captioning chain's configured + retry
attempts both no-op (returning None), and only the final
``gemini/gemini-3-flash-preview`` override has a chance to succeed —
and only if a ``GEMINI_API_KEY`` is set in the environment.
"""
vision = get_hive_config().get("vision_fallback", {})
if vision.get("provider") and vision.get("model"):