feat: vision model retry and fallback

This commit is contained in:
Richard Tang
2026-04-30 12:38:30 -07:00
parent 628ce9ca12
commit a0817fcde4
3 changed files with 51 additions and 73 deletions
+26 -56
View File
@@ -14,7 +14,6 @@ from __future__ import annotations
import asyncio
import json
import logging
import os
import re
import time
import uuid
@@ -183,50 +182,20 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images using the best available vision model.
"""Describe images for the injection-queue drain (no preceding tool call).
Returns ``(description, model)`` on success — the formatted
placeholder text plus the model id that produced it — or ``None``
when every candidate fails or no API key is configured.
Wraps :func:`_captioning_chain` with a generic intent and returns
the caption inside an ``[image attached description: ]`` envelope
so the injected text reads as image content rather than free-form
prose.
"""
import litellm
blocks: list[dict[str, Any]] = [
{
"type": "text",
"text": (
"Describe the following image(s) concisely but with enough detail "
"that a text-only AI assistant can understand the content and context."
),
}
]
blocks.extend(image_content)
candidates: list[str] = []
if os.environ.get("OPENAI_API_KEY"):
candidates.append("gpt-4o-mini")
if os.environ.get("ANTHROPIC_API_KEY"):
candidates.append("claude-3-haiku-20240307")
if os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY"):
candidates.append("gemini/gemini-1.5-flash")
for model in candidates:
try:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": blocks}],
max_tokens=512,
)
description = (response.choices[0].message.content or "").strip()
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]", model
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
return None
intent = "Describe the attached image(s) so a text-only agent can understand them."
result = await _captioning_chain(intent, image_content)
if not result:
return None
description, model = result
label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
return f"[{label} attached — description: {description}]", model
def _vision_fallback_active(model: str | None) -> bool:
@@ -253,21 +222,22 @@ async def _captioning_chain(
intent: str,
image_content: list[dict[str, Any]],
) -> tuple[str, str] | None:
"""Two-stage caption chain used by the agent-loop tool-result hook.
"""Configured vision_fallback → retry → ``gemini/gemini-3-flash-preview``.
Stage 1: configured ``vision_fallback`` model with intent + images.
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku →
gemini-flash) when stage 1 is unconfigured or fails.
Returns ``(caption, model)`` — the caption text paired with the
model id that produced it — or ``None`` if both stages fail.
Caller is responsible for the placeholder-on-None and the splice
into the persisted tool-result content.
The Gemini override reuses the configured ``api_key`` / ``api_base``,
so a Hive subscriber (whose token routes to a multi-model proxy)
keeps coverage when their primary model glitches. Without
configured creds litellm falls through to env-based Gemini auth;
users with neither Hive nor a ``GEMINI_API_KEY`` simply lose the
third try.
"""
result = await caption_tool_image(intent, image_content)
if not result:
result = await _describe_images_as_text(image_content)
return result
if result := await caption_tool_image(intent, image_content):
return result
logger.warning("vision_fallback failed; retrying configured model")
if result := await caption_tool_image(intent, image_content):
return result
logger.warning("vision_fallback retry failed; trying gemini-3-flash-preview")
return await caption_tool_image(intent, image_content, model_override="gemini/gemini-3-flash-preview")
# Pattern for detecting context-window-exceeded errors across LLM providers.
@@ -18,9 +18,10 @@ This module provides:
Both helpers degrade silently — return ``None`` / a placeholder rather
than raise — so a vision-fallback failure can never kill the main
agent's run. The agent-loop call site is responsible for chaining
through to the existing generic-caption rotation
(``_describe_images_as_text``) on a None return.
agent's run. The agent-loop call site retries the configured model
once on a None return, then falls back to
``gemini/gemini-3-flash-preview`` via the ``model_override`` parameter
of :func:`caption_tool_image`.
"""
from __future__ import annotations
@@ -156,25 +157,30 @@ async def caption_tool_image(
image_content: list[dict[str, Any]],
*,
timeout_s: float = 30.0,
model_override: str | None = None,
) -> tuple[str, str] | None:
"""Caption the given images using the configured ``vision_fallback`` model.
Returns ``(caption, model)`` on success — the model's text response
paired with the model id that produced it — or ``None`` on any
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
Returns ``(caption, model)`` on success or ``None`` on any failure
(no config, no API key, timeout, exception, empty response).
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn`` so the
cost / latency / quality are auditable post-hoc, tagged with
``execution_id="vision_fallback_subagent"``.
``model_override`` swaps in a different litellm model id while
keeping the configured ``vision_fallback`` ``api_key`` / ``api_base``
untouched. That's deliberate: Hive subscribers configure
``vision_fallback`` to point at the Hive proxy, which routes to
multiple models — including Gemini — so reusing the credentials lets
a Gemini-3-flash override still work without a separate
``GEMINI_API_KEY``. When no creds are configured, litellm falls
back to env-var resolution.
Logs each call to ``~/.hive/llm_logs`` via ``log_llm_turn``.
"""
model = get_vision_fallback_model()
model = model_override or get_vision_fallback_model()
if not model:
return None
api_key = get_vision_fallback_api_key()
api_base = get_vision_fallback_api_base()
if not api_key:
if not api_key and not model_override:
logger.debug("vision_fallback configured but no API key resolved; skipping")
return None
@@ -195,8 +201,9 @@ async def caption_tool_image(
"messages": messages,
"max_tokens": 1024,
"timeout": timeout_s,
"api_key": api_key,
}
if api_key:
kwargs["api_key"] = api_key
if api_base:
kwargs["api_base"] = api_base
+4 -3
View File
@@ -162,9 +162,10 @@ def get_vision_fallback_model() -> str | None:
Used by the agent-loop hook that captions tool-result images when the
main agent's model cannot accept image content (text-only LLMs).
When this returns None the fallback chain skips the configured-subagent
stage and proceeds straight to the generic caption rotation
(``_describe_images_as_text``).
When this returns None the captioning chain's configured + retry
attempts both no-op (returning None), and only the final
``gemini/gemini-3-flash-preview`` override has a chance to succeed —
and only if a ``GEMINI_API_KEY`` is set in the environment.
"""
vision = get_hive_config().get("vision_fallback", {})
if vision.get("provider") and vision.get("model"):