chore: log the vision fallback model usage

This commit is contained in:
Richard Tang
2026-04-29 13:04:48 -07:00
parent 5492366c31
commit 4794c8b816
3 changed files with 37 additions and 21 deletions
+23 -14
View File
@@ -182,8 +182,13 @@ def _strip_internal_tags_from_snapshot(snapshot: str) -> str:
return cleaned
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str | None:
"""Describe images using the best available vision model."""
async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> tuple[str, str] | None:
"""Describe images using the best available vision model.
Returns ``(description, model)`` on success — the formatted
placeholder text plus the model id that produced it — or ``None``
when every candidate fails or no API key is configured.
"""
import litellm
blocks: list[dict[str, Any]] = [
@@ -216,7 +221,7 @@ async def _describe_images_as_text(image_content: list[dict[str, Any]]) -> str |
if description:
count = len(image_content)
label = "image" if count == 1 else f"{count} images"
return f"[{label} attached — description: {description}]"
return f"[{label} attached — description: {description}]", model
except Exception as exc:
logger.debug("Vision fallback model '%s' failed: %s", model, exc)
continue
@@ -247,21 +252,22 @@ def _vision_fallback_active(model: str | None) -> bool:
async def _captioning_chain(
intent: str,
image_content: list[dict[str, Any]],
) -> str | None:
) -> tuple[str, str] | None:
"""Two-stage caption chain used by the agent-loop tool-result hook.
Stage 1: configured ``vision_fallback`` model with intent + images.
Stage 2: generic-caption rotation (gpt-4o-mini → claude-3-haiku →
gemini-flash) when stage 1 is unconfigured or fails.
Returns the caption text or None if both stages fail. Caller is
responsible for the placeholder-on-None and the splice into the
persisted tool-result content.
Returns ``(caption, model)`` — the caption text paired with the
model id that produced it — or ``None`` if both stages fail.
Caller is responsible for the placeholder-on-None and the splice
into the persisted tool-result content.
"""
caption = await caption_tool_image(intent, image_content)
if not caption:
caption = await _describe_images_as_text(image_content)
return caption
result = await caption_tool_image(intent, image_content)
if not result:
result = await _describe_images_as_text(image_content)
return result
# Pattern for detecting context-window-exceeded errors across LLM providers.
@@ -3499,14 +3505,17 @@ class AgentLoop(AgentProtocol):
# this tool).
vision_fallback_marker: str | None = None
if image_content and tc.tool_use_id in caption_tasks:
caption = await caption_tasks.pop(tc.tool_use_id)
if caption:
caption_result = await caption_tasks.pop(tc.tool_use_id)
if caption_result:
caption, vision_model = caption_result
vision_fallback_marker = f"[vision-fallback caption]\n{caption}"
logger.info(
"vision_fallback: captioned %d image(s) for tool '%s' (model '%s' routed through fallback)",
"vision_fallback: captioned %d image(s) for tool '%s' "
"(main model '%s' routed through fallback model '%s')",
len(image_content),
tc.tool_name,
ctx.llm.model if ctx.llm else "?",
vision_model,
)
else:
vision_fallback_marker = "[image stripped — vision fallback exhausted]"
@@ -162,7 +162,7 @@ async def drain_injection_queue(
conversation: NodeConversation,
*,
ctx: NodeContext,
describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[str | None]] | None) = None,
describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[tuple[str, str] | None]] | None) = None,
) -> int:
"""Drain all pending injected events as user messages. Returns count."""
count = 0
@@ -185,10 +185,14 @@ async def drain_injection_queue(
ctx.llm.model,
)
if describe_images_as_text_fn is not None:
description = await describe_images_as_text_fn(image_content)
if description:
described = await describe_images_as_text_fn(image_content)
if described:
description, vision_model = described
content = f"{content}\n\n{description}" if content else description
logger.info("[drain] image described as text via vision fallback")
logger.info(
"[drain] image described as text via vision fallback (model '%s')",
vision_model,
)
else:
logger.info("[drain] no vision fallback available; images dropped")
image_content = None
@@ -156,10 +156,11 @@ async def caption_tool_image(
image_content: list[dict[str, Any]],
*,
timeout_s: float = 30.0,
) -> str | None:
) -> tuple[str, str] | None:
"""Caption the given images using the configured ``vision_fallback`` model.
Returns the model's text response on success, or ``None`` on any
Returns ``(caption, model)`` on success — the model's text response
paired with the model id that produced it — or ``None`` on any
failure (no config, no API key, timeout, exception, empty
response). Callers chain to the next stage of the fallback on None.
@@ -241,7 +242,9 @@ async def caption_tool_image(
except Exception:
pass
return caption
if caption is None:
return None
return caption, model
__all__ = ["caption_tool_image", "extract_intent_for_tool"]