feat: vision model retry and fallback

This commit is contained in:
Richard Tang
2026-04-30 12:38:30 -07:00
parent 628ce9ca12
commit a0817fcde4
3 changed files with 51 additions and 73 deletions
+26 -56
View File
@@ -14,7 +14,6 @@ from __future__ import annotations
import asyncio import asyncio
import json import json
import logging import logging
import os
import re import re
import time import time
import uuid import uuid
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None: async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images using the best available vision model. """Describe images for the injection-queue drain (no preceding tool call).
Returns ``(description, model)`` on success — the formatted Wraps :func:`_captioning_chain` with a generic intent and returns
placeholder text plus the model id that produced it — or ``None`` the caption inside an ``[image attached — description: ]`` envelope
when every candidate fails or no API key is configured. so the injected text reads as image content rather than free-form
prose.
""" """
import litellm intent = "Describe the attached image(s) so a text-only agent can understand them."
result = await _captioning_chain(intent, image_content)
blocks: list[dict[str, Any]] = [ if not result:
{ return None
"type": "text", description, model = result
"text": ( label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
"Describe the following image(s) concisely but with enough detail " return f"[{label} attached — description: {description}]", model
"that a text-only AI assistant can understand the content and context."
),
}
]
blocks.extend(image_content)
candidates: list[str] = []
if os.environ.get("OPENAI_API_KEY"):
candidates.append("gpt-4o-mini")
if os.environ.get("ANTHROPIC_API_KEY"):
candidates.append("claude-3-haiku-20240307")
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
candidates.append("gemini/gemini-1.5-flash")
for model in candidates:
try:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": blocks}],
max_tokens=512,
)
description = (response.choices[0].message.content or "").strip()
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]", model
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
return None
def _vision_fallback_active(model: str | None) -> bool: def _vision_fallback_active(model: str | None) -> bool:
@@ -253,21 +222,22 @@ async def _captioning_chain(
intent: str, intent: str,
image_content: list[dict[str, Any]], image_content: list[dict[str, Any]],
) -> tuple[str, str] | None: ) -> tuple[str, str] | None:
"""Two-stage caption chain used by the agent-loop tool-result hook. """Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
Stage 1: configured ``vision_fallback`` model with intent + images. The Gemini override reuses the configured ``api_key`` / ``api_base``,
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku → so a Hive subscriber (whose token routes to a multi-model proxy)
gemini-flash) when stage 1 is unconfigured or fails. keeps coverage when their primary model glitches. Without
configured creds litellm falls through to env-based Gemini auth;
Returns ``(caption, model)`` — the caption text paired with the users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
model id that produced it — or ``None`` if both stages fail. third try.
Caller is responsible for the placeholder-on-None and the splice
into the persisted tool-result content.
""" """
result = await caption_tool_image(intent, image_content) if result := await caption_tool_image(intent, image_content):
if not result: return result
result = await _describe_images_as_text(image_content) logger.warning("vision_fallback failed; retrying configured model")
return result if result := await caption_tool_image(intent, image_content):
return result
logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
# Pattern for detecting context-window-exceeded errors across LLM providers. # Pattern for detecting context-window-exceeded errors across LLM providers.
@@ -18,9 +18,10 @@ This module provides:
Both helpers degrade silently — return ``None`` / a placeholder rather Both helpers degrade silently — return ``None`` / a placeholder rather
than raise — so a vision-fallback failure can never kill the main than raise — so a vision-fallback failure can never kill the main
agent's run. The agent-loop call site is responsible for chaining agent's run. The agent-loop call site retries the configured model
through to the existing generic-caption rotation once on a None return, then falls back to
(``_describe_images_as_text``) on a None return. ``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
of :func:`caption_tool_image`.
""" """
from __future__ import annotations from __future__ import annotations
@@ -156,25 +157,30 @@ async def caption_tool_image(
image_content: list[dict[str, Any]], image_content: list[dict[str, Any]],
*, *,
timeout_s: float = 30.0, timeout_s: float = 30.0,
model_override: str | None = None,
) -> tuple[str, str] | None: ) -> tuple[str, str] | None:
"""Caption the given images using the configured ``vision_fallback`` model. """Caption the given images using the configured ``vision_fallback`` model.
Returns ``(caption, model)`` on success — the model's text response Returns ``(caption, model)`` on success or ``None`` on any failure
paired with the model id that produced it — or ``None`` on any (no config, no API key, timeout, exception, empty response).
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the ``model_override`` swaps in a different litellm model id while
cost / latency / quality are auditable post-hoc, tagged with keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
``execution_id="vision_fallback_subagent"``. untouched. That's deliberate: Hive subscribers configure
``vision_fallback`` to point at the Hive proxy, which routes to
multiple models including Gemini so reusing the credentials lets
a Gemini-3-flash override still work without a separate
``GEMINI_API_KEY``. When no creds are configured, litellm falls
back to env-var resolution.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
""" """
model = get_vision_fallback_model() model = model_override or get_vision_fallback_model()
if not model: if not model:
return None return None
api_key = get_vision_fallback_api_key() api_key = get_vision_fallback_api_key()
api_base = get_vision_fallback_api_base() api_base = get_vision_fallback_api_base()
if not api_key: if not api_key and not model_override:
logger.debug("vision_fallback configured but no API key resolved; skipping") logger.debug("vision_fallback configured but no API key resolved; skipping")
return None return None
@@ -195,8 +201,9 @@ async def caption_tool_image(
"messages": messages, "messages": messages,
"max_tokens": 1024, "max_tokens": 1024,
"timeout": timeout_s, "timeout": timeout_s,
"api_key": api_key,
} }
if api_key:
kwargs["api_key"] = api_key
if api_base: if api_base:
kwargs["api_base"] = api_base kwargs["api_base"] = api_base
+4 -3
View File
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
Used by the agent-loop hook that captions tool-result images when the Used by the agent-loop hook that captions tool-result images when the
main agent's model cannot accept image content (text-only LLMs). main agent's model cannot accept image content (text-only LLMs).
When this returns None the fallback chain skips the configured-subagent When this returns None the captioning chain's configured + retry
stage and proceeds straight to the generic caption rotation attempts both no-op (returning None), and only the final
(``_describe_images_as_text``). ``gemini/gemini-3-flash-preview`` override has a chance to succeed
and only if a ``GEMINI_API_KEY`` is set in the environment.
""" """
vision = get_hive_config().get("vision_fallback", {}) vision = get_hive_config().get("vision_fallback", {})
if vision.get("provider") and vision.get("model"): if vision.get("provider") and vision.get("model"):