feat: vision fallback with intent

Richard Tang
2026-04-30 13:02:57 -07:00
parent a0817fcde4
commit 73511a3c59
4 changed files with 134 additions and 30 deletions
+2 -19
@@ -181,23 +181,6 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
return cleaned
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images for the injection-queue drain (no preceding tool call).
Wraps :func:`_captioning_chain` with a generic intent and returns
the caption inside an ``[image attached — description: ...]`` envelope
so the injected text reads as image content rather than free-form
prose.
"""
intent = "Describe the attached image(s) so a text-only agent can understand them."
result = await _captioning_chain(intent, image_content)
if not result:
return None
description, model = result
label = "image" if len(image_content) == 1 else f"{len(image_content)} images"
return f"[{label} attached — description: {description}]", model
def _vision_fallback_active(model: str | None) -> bool:
"""Return True if tool-result images for *model* should be routed
through the vision-fallback chain rather than sent to the model.
@@ -3435,7 +3418,7 @@ class AgentLoop(AgentProtocol):
# single image needs captioning, this collapses to a
# single await with no overhead.
_model_text_only = ctx.llm and _vision_fallback_active(ctx.llm.model)
- caption_tasks: dict[str, asyncio.Task[str | None]] = {}
+ caption_tasks: dict[str, asyncio.Task[tuple[str, str] | None]] = {}
if _model_text_only:
for tc in tool_calls[:executed_in_batch]:
res = results_by_id.get(tc.tool_use_id)
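The comment above describes the captioning fan-out: one task per image-bearing tool result, keyed by tool_use_id, so a turn with a single image still costs a single await. A minimal sketch of that pattern under the new tuple[str, str] | None result shape; the function and variable names here are illustrative, not the loop's real internals:

import asyncio
from typing import Any, Awaitable, Callable

CaptionFn = Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]]

async def caption_in_parallel(
    images_by_tool_use_id: dict[str, list[dict[str, Any]]],
    caption_fn: CaptionFn,
    intent: str,
) -> dict[str, tuple[str, str] | None]:
    # Fan out: one captioning task per tool result that carries images.
    tasks = {
        tool_use_id: asyncio.create_task(caption_fn(intent, images))
        for tool_use_id, images in images_by_tool_use_id.items()
        if images
    }
    # Fan in: with only one image this collapses to a single await.
    return {tool_use_id: await task for tool_use_id, task in tasks.items()}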
@@ -4139,7 +4122,7 @@ class AgentLoop(AgentProtocol):
queue=self._injection_queue,
conversation=conversation,
ctx=ctx,
- describe_images_as_text_fn=_describe_images_as_text,
+ caption_image_fn=_captioning_chain,
)
async def _drain_trigger_queue(self, conversation: NodeConversation) -> int:
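The drain above is now wired with _captioning_chain directly instead of the removed _describe_images_as_text wrapper. For reference, the contract it assumes of the hook: an awaitable taking (intent, image_content) and resolving to (caption, model) on success or None on failure. A stub satisfying that contract; the body and model id are illustrative, not the real _captioning_chain:

from typing import Any

async def caption_image_fn(
    intent: str, image_content: list[dict[str, Any]]
) -> tuple[str, str] | None:
    # Return (caption, model_id) on success; returning None lets the
    # drain inject the user's text without an image description.
    caption = f"{len(image_content)} image(s) described for intent: {intent!r}"
    return caption, "example/vision-model"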
@@ -162,9 +162,20 @@ async def drain_injection_queue(
conversation: NodeConversation,
*,
ctx: NodeContext,
- describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None,
+ caption_image_fn: (
+ Callable[[str, list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None
+ ) = None,
) -> int:
- """Drain all pending injected events as user messages. Returns count."""
+ """Drain all pending injected events as user messages. Returns count.
``caption_image_fn`` is the unified vision fallback hook. It takes
``(intent, image_content)`` and returns ``(caption, model)`` on
success; the model id is logged so the destination is observable.
The user's typed ``content`` (the injected message body) is passed
as the intent so the captioner can answer the user's specific
question about the image rather than producing a generic
description; an empty content falls back to a generic intent.
"""
count = 0
logger.debug(
"[drain_injection_queue] Starting to drain queue, initial queue size: %s",
@@ -184,10 +195,13 @@ async def drain_injection_queue(
"Model '%s' does not support images; attempting vision fallback",
ctx.llm.model,
)
- if describe_images_as_text_fn is not None:
- described = await describe_images_as_text_fn(image_content)
- if described:
- description, vision_model = described
+ if caption_image_fn is not None:
+ intent = content or (
+ "Describe these user-injected images for a text-only agent."
+ )
+ caption_result = await caption_image_fn(intent, image_content)
+ if caption_result:
+ description, vision_model = caption_result
content = f"{content}\n\n{description}" if content else description
logger.info(
"[drain] image described as text via vision fallback (model '%s')",
@@ -196,16 +196,53 @@ async def caption_tool_image(
{"role": "user", "content": user_blocks},
]
# Apply the same proxy rewrites the main LLM provider uses so a
# `hive/...` / `kimi/...` model resolves to the right Anthropic-
# compatible endpoint with the right auth header. Without this,
# litellm doesn't know what `hive/kimi-k2.5` is and rejects the call
# with "LLM Provider NOT provided."
from framework.llm.litellm import rewrite_proxy_model
rewritten_model, rewritten_base, extra_headers = rewrite_proxy_model(
model, api_key, api_base
)
kwargs: dict[str, Any] = {
- "model": model,
+ "model": rewritten_model,
"messages": messages,
"max_tokens": 1024,
"timeout": timeout_s,
}
# Pass api_key directly only when there are no proxy-rewritten
# extra_headers carrying the auth (e.g. the gemini-3-flash override
# path goes direct to Gemini, not through the Hive proxy).
- if api_key:
+ if api_key and not extra_headers:
kwargs["api_key"] = api_key
- if api_base:
- kwargs["api_base"] = api_base
+ if rewritten_base:
+ kwargs["api_base"] = rewritten_base
if extra_headers:
kwargs["extra_headers"] = extra_headers
# Surface where the request is going so the user can verify the
# vision fallback is hitting the expected proxy / model. Redacts
# the API key to a length+head+tail digest so it can be cross-
# correlated with other auth-related log lines.
key_digest = (
f"len={len(api_key)} {api_key[:8]}{api_key[-4:]}"
if api_key and len(api_key) >= 12
else f"len={len(api_key) if api_key else 0}"
)
logger.info(
"[vision_fallback] dispatching: configured_model=%s rewritten_model=%s "
"api_base=%s api_key=%s images=%d intent_chars=%d timeout_s=%.1f",
model,
rewritten_model,
rewritten_base or "<litellm-default>",
key_digest,
len(image_content),
len(intent),
timeout_s,
)
started = datetime.now()
caption: str | None = None
@@ -215,9 +252,21 @@ async def caption_tool_image(
text = (response.choices[0].message.content or "").strip()
if text:
caption = text
logger.info(
"[vision_fallback] response: model=%s api_base=%s elapsed_s=%.2f chars=%d",
rewritten_model,
rewritten_base or "<litellm-default>",
(datetime.now() - started).total_seconds(),
len(text),
)
except Exception as exc:
error_text = f"{type(exc).__name__}: {exc}"
- logger.debug("vision_fallback model '%s' failed: %s", model, exc)
+ logger.warning(
"[vision_fallback] failed: model=%s api_base=%s error=%s",
rewritten_model,
rewritten_base or "<litellm-default>",
error_text,
)
# Best-effort audit log so users can grep ~/.hive/llm_logs/ for
# vision-fallback subagent calls. Failures here must not bubble.
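The redaction comment above yields digests like the one below, so a key can be matched across auth-related log lines without ever appearing verbatim. The key value here is made up; the format string is the one added in this diff:

api_key = "sk-example-0123456789abcdef"  # placeholder key, 27 chars
key_digest = (
    f"len={len(api_key)} {api_key[:8]}{api_key[-4:]}"
    if api_key and len(api_key) >= 12
    else f"len={len(api_key) if api_key else 0}"
)
assert key_digest == "len=27 sk-exampcdef"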
+58
@@ -211,6 +211,44 @@ def _ensure_ollama_chat_prefix(model: str) -> str:
return model
def rewrite_proxy_model(
model: str, api_key: str | None, api_base: str | None
) -> tuple[str, str | None, dict[str, str]]:
"""Apply Hive/Kimi proxy rewrites for any caller of ``litellm.acompletion``.
Both the Hive LLM proxy and Kimi For Coding expose Anthropic-API-
compatible endpoints. LiteLLM doesn't recognise the ``hive/`` or
``kimi/`` prefixes natively, so we rewrite them to ``anthropic/``
here. For the Hive proxy we also stamp a Bearer token into
``extra_headers`` because litellm's Anthropic handler only sends
``x-api-key`` and the proxy expects ``Authorization: Bearer``.
Used by ad-hoc ``litellm.acompletion`` callers (e.g. the vision-
fallback subagent in ``caption_tool_image``) so they hit the same
proxy with the same auth as the main agent's ``LiteLLMProvider``.
The provider's own ``__init__`` keeps its inlined rewrite for now —
this helper is the single source of truth for ad-hoc callers.
Returns: (rewritten_model, normalised_api_base, extra_headers).
The ``extra_headers`` dict is non-empty only for the Hive proxy
(and only when ``api_key`` is provided).
"""
extra_headers: dict[str, str] = {}
if model.lower().startswith("kimi/"):
model = "anthropic/" + model[len("kimi/") :]
if api_base and api_base.rstrip("/").endswith("/v1"):
api_base = api_base.rstrip("/")[:-3]
elif model.lower().startswith("hive/"):
model = "anthropic/" + model[len("hive/") :]
if api_base and api_base.rstrip("/").endswith("/v1"):
api_base = api_base.rstrip("/")[:-3]
# Hive proxy expects Bearer auth; litellm's Anthropic handler
# only sends x-api-key without this nudge.
if api_key:
extra_headers["Authorization"] = f"Bearer {api_key}"
return model, api_base, extra_headers
RATE_LIMIT_MAX_RETRIES = 10
RATE_LIMIT_BACKOFF_BASE = 2 # seconds
RATE_LIMIT_MAX_DELAY = 120 # seconds - cap to prevent absurd waits
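To make the rewrite concrete, here is what the helper above returns for the two prefixes it handles; the key and base URLs are placeholder values:

# Hive proxy: prefix rewritten, trailing /v1 stripped, Bearer header stamped.
model, base, headers = rewrite_proxy_model(
    "hive/kimi-k2.5", "sk-placeholder", "https://hive-proxy.example/v1"
)
assert model == "anthropic/kimi-k2.5"
assert base == "https://hive-proxy.example"
assert headers == {"Authorization": "Bearer sk-placeholder"}

# Kimi For Coding: same model/base rewrite, but no extra headers, since
# litellm's default x-api-key auth is what that endpoint expects.
model, base, headers = rewrite_proxy_model(
    "kimi/kimi-k2.5", "sk-placeholder", "https://api.kimi.example/v1"
)
assert model == "anthropic/kimi-k2.5"
assert base == "https://api.kimi.example"
assert headers == {}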
@@ -963,6 +1001,7 @@ class LiteLLMProvider(LLMProvider):
# Translate kimi/ prefix to anthropic/ so litellm uses the Anthropic
# Messages API handler and routes to that endpoint — no special headers needed.
_original_model = model
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
if _is_ollama_model(model):
model = _ensure_ollama_chat_prefix(model)
elif model.lower().startswith("kimi/"):
@@ -1016,6 +1055,7 @@ class LiteLLMProvider(LLMProvider):
these attributes in-place propagates to all callers on the next LLM call.
"""
_original_model = model
self._hive_proxy_auth = bool(_original_model.lower().startswith("hive/"))
if _is_ollama_model(model):
model = _ensure_ollama_chat_prefix(model)
elif model.lower().startswith("kimi/"):
@@ -1255,6 +1295,16 @@ class LiteLLMProvider(LLMProvider):
# Ollama requires explicit tool_choice=auto for function calling
# so future readers don't have to guess.
kwargs.setdefault("tool_choice", "auto")
elif self._hive_proxy_auth:
# The Hive LLM proxy fronts GLM, which drifts into "explain
# the plan" mode on long-context turns instead of emitting
# tool_use blocks (verified 2026-04-28: tool_choice=null →
# text-only stop=stop; tool_choice=required → clean
# tool_use). Force a tool call when tools are available
# so queens can't get stuck in chat mode. Callers that
# legitimately want a non-tool turn can override via
# extra_kwargs.
kwargs.setdefault("tool_choice", "required")
# Add response_format for structured output
# LiteLLM passes this through to the underlying provider
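As the tool_choice comment above notes, callers that want a non-tool turn can still override the forcing through extra_kwargs: because the provider uses kwargs.setdefault, an explicit tool_choice always wins. A minimal illustration of that dict semantics (plain Python, not the provider API itself):

kwargs = {"tool_choice": "none"}  # caller-supplied via extra_kwargs
kwargs.setdefault("tool_choice", "required")
assert kwargs["tool_choice"] == "none"  # explicit value survives the forcing

kwargs = {}  # nothing supplied: the Hive-proxy default kicks in
kwargs.setdefault("tool_choice", "required")
assert kwargs["tool_choice"] == "required"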
@@ -1492,6 +1542,10 @@ class LiteLLMProvider(LLMProvider):
# Ollama requires explicit tool_choice=auto for function calling
# so future readers don't have to guess.
kwargs.setdefault("tool_choice", "auto")
elif self._hive_proxy_auth:
# See `complete()` for the rationale: GLM behind the Hive
# proxy needs forcing or it goes chat-mode on long contexts.
kwargs.setdefault("tool_choice", "required")
if response_format:
kwargs["response_format"] = response_format
@@ -2276,6 +2330,10 @@ class LiteLLMProvider(LLMProvider):
# Ollama requires explicit tool_choice=auto for function calling
# so future readers don't have to guess.
kwargs.setdefault("tool_choice", "auto")
elif self._hive_proxy_auth:
# See `complete()` for the rationale: GLM behind the Hive
# proxy needs forcing or it goes chat-mode on long contexts.
kwargs.setdefault("tool_choice", "required")
if response_format:
kwargs["response_format"] = response_format
# The Codex ChatGPT backend (Responses API) rejects several params.